diff --git a/.github/workflows/build-sphinx.yml b/.github/workflows/build-sphinx.yml index 31423357e3f3..05f5b2711e24 100644 --- a/.github/workflows/build-sphinx.yml +++ b/.github/workflows/build-sphinx.yml @@ -26,7 +26,7 @@ jobs: name: Build and Deploy Docs runs-on: ubuntu-22.04 - timeout-minutes: 60 + timeout-minutes: 90 permissions: # Needed to cancel any previous runs that are not completed for a given workflow diff --git a/.github/workflows/check-onemath.yaml b/.github/workflows/check-onemath.yaml index bead55d5f495..3ad8ba1ee84e 100644 --- a/.github/workflows/check-onemath.yaml +++ b/.github/workflows/check-onemath.yaml @@ -74,7 +74,7 @@ jobs: os: [ubuntu-22.04] # windows-2022 - no DFT support for Windows in oneMKL runs-on: ${{ matrix.os }} - timeout-minutes: 60 + timeout-minutes: 120 defaults: run: @@ -133,6 +133,14 @@ jobs: if: env.rerun-tests-on-failure != 'true' run: | python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + run: | + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 @@ -150,6 +158,24 @@ jobs: mamba activate ${{ env.test-env-name }} python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: ReRun tensor tests on Linux + if: env.rerun-tests-on-failure == 'true' + id: run_tensor_tests + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + command: | + . $CONDA/etc/profile.d/conda.sh + . $CONDA/etc/profile.d/mamba.sh + mamba activate ${{ env.test-env-name }} + + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 @@ -239,6 +265,14 @@ jobs: if: env.rerun-tests-on-failure != 'true' run: | python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + run: | + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 @@ -256,5 +290,23 @@ jobs: mamba activate ${{ env.test-env-name }} python -m pytest -ra --pyargs dpnp.tests + env: + SKIP_TENSOR_TESTS: 1 + SYCL_CACHE_PERSISTENT: 1 + + - name: ReRun tensor tests on Linux + if: env.rerun-tests-on-failure == 'true' + id: run_tensor_tests_branch + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + command: | + . $CONDA/etc/profile.d/conda.sh + . $CONDA/etc/profile.d/mamba.sh + mamba activate ${{ env.test-env-name }} + + python -m pytest -ra --pyargs dpnp.tests.tensor env: SYCL_CACHE_PERSISTENT: 1 diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 886204654a98..afd34ee00543 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -37,7 +37,7 @@ jobs: actions: write runs-on: ${{ matrix.os }} - timeout-minutes: 60 + timeout-minutes: 90 defaults: run: @@ -220,6 +220,7 @@ jobs: - name: Run tests if: env.rerun-tests-on-failure != 'true' run: | + export SKIP_TENSOR_TESTS=1 if [[ "${{ matrix.python }}" == "${{ env.python-ver-test-all-dtypes }}" ]]; then export DPNP_TEST_ALL_INT_TYPES=1 python -m pytest -ra --pyargs ${{ env.package-name }}.tests @@ -239,6 +240,7 @@ jobs: . $CONDA/etc/profile.d/conda.sh . 
$CONDA/etc/profile.d/mamba.sh mamba activate ${{ env.test-env-name }} + export SKIP_TENSOR_TESTS=1 if [[ "${{ matrix.python }}" == "${{ env.python-ver-test-all-dtypes }}" ]]; then export DPNP_TEST_ALL_INT_TYPES=1 @@ -247,6 +249,26 @@ jobs: python -m pytest -n auto -ra --pyargs ${{ env.package-name }}.tests fi + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + run: | + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + + - name: Run tensor tests + if: env.rerun-tests-on-failure == 'true' + id: run_tests_tensor_linux + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + command: | + . $CONDA/etc/profile.d/conda.sh + . $CONDA/etc/profile.d/mamba.sh + mamba activate ${{ env.test-env-name }} + + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + test_windows: name: Test @@ -382,6 +404,7 @@ jobs: if: env.rerun-tests-on-failure != 'true' shell: pwsh run: | + $env:SKIP_TENSOR_TESTS=1 if (${{ matrix.python }} -eq ${{ env.python-ver-test-all-dtypes }}) { $env:DPNP_TEST_ALL_INT_TYPES=1 python -m pytest -ra --pyargs ${{ env.package-name }}.tests @@ -399,6 +422,7 @@ jobs: retry_on: any shell: pwsh command: | + $env:SKIP_TENSOR_TESTS=1 if ( ${{ matrix.python }} -eq ${{ env.python-ver-test-all-dtypes }} ) { $env:DPNP_TEST_ALL_INT_TYPES=1 python -m pytest -ra --pyargs ${{ env.package-name }}.tests @@ -406,6 +430,24 @@ jobs: python -m pytest -n auto -ra --pyargs ${{ env.package-name }}.tests } + - name: Run tensor tests + if: env.rerun-tests-on-failure != 'true' + shell: pwsh + run: | + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + + - name: Run tensor tests + if: env.rerun-tests-on-failure == 'true' + id: run_tests_tensor_win + uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2 + with: + timeout_minutes: ${{ env.rerun-tests-timeout }} + max_attempts: ${{ env.rerun-tests-max-attempts }} + retry_on: any + shell: pwsh + command: | + python -m pytest -n auto -ra --pyargs dpnp.tests.tensor + upload: name: Upload diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml index 5fd211e55a81..3d5d34531adf 100644 --- a/.github/workflows/generate_coverage.yaml +++ b/.github/workflows/generate_coverage.yaml @@ -11,7 +11,7 @@ jobs: name: Generate coverage and push to Coveralls.io runs-on: ubuntu-latest - timeout-minutes: 120 + timeout-minutes: 150 permissions: # Needed to cancel any previous runs that are not completed for a given workflow @@ -122,7 +122,7 @@ jobs: uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0 with: shell: bash - timeout_minutes: 60 + timeout_minutes: 120 max_attempts: 5 retry_on: error command: | @@ -130,6 +130,7 @@ jobs: conda activate coverage [ -f /opt/intel/oneapi/setvars.sh ] && source /opt/intel/oneapi/setvars.sh git clean -fxd + export SKIP_TENSOR_TESTS=1 python scripts/gen_coverage.py - name: Total number of coverage attempts diff --git a/.gitignore b/.gitignore index 5d2725d3186f..f66bfbb3fdd8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ dpnp_pytest.* example3 *dpnp_backend* +dpnp/include/dpnp/tensor/*.h dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ diff --git a/CHANGELOG.md b/CHANGELOG.md index f8aaae542ec5..bf659a351a57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.20.0] - MM/DD/2026 +This release 
introduces a major architectural change: the Array API-compliant tensor implementation has been migrated from `dpctl.tensor` into `dpnp.tensor`, simplifying maintenance, reducing cross-project dependencies, and allowing the tensor implementation to evolve within `dpnp`. This release changes the license from `BSD-2-Clause` to `BSD-3-Clause`. This release achieves `dpnp` compatibility with Python 3.14 and enables distributing `dpnp` packages with the latest Python version. Also, that release drops support for Python 3.9, making Python 3.10 the minimum required version. @@ -28,6 +29,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Added implementation of `dpnp.isin` function [#2595](https://github.com/IntelPython/dpnp/pull/2595) * Added implementation of `dpnp.scipy.linalg.lu` (SciPy-compatible) [#2787](https://github.com/IntelPython/dpnp/pull/2787) * Added support for ndarray subclassing via `dpnp.ndarray.view` method with `type` parameter [#2815](https://github.com/IntelPython/dpnp/issues/2815) +* Migrated tensor implementation from `dpctl.tensor` into `dpnp.tensor`, making `dpnp` the primary owner of the Array API-compliant tensor layer [#2856](https://github.com/IntelPython/dpnp/pull/2856) ### Changed @@ -84,6 +86,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum * Resolved an issue with strides calculation in `dpnp.diagonal` to return correct values for empty diagonals [#2814](https://github.com/IntelPython/dpnp/pull/2814) * Fixed test tolerance issues for float16 intermediate precision that became visible when testing against conda-forge's NumPy [#2828](https://github.com/IntelPython/dpnp/pull/2828) * Ensured device aware dtype handling in `dpnp.identity` and `dpnp.gradient` [#2835](https://github.com/IntelPython/dpnp/pull/2835) +* Fixed `dpnp.tensor.round` to use device-aware output dtype for boolean input [#2851](https://github.com/IntelPython/dpnp/pull/2851) ### Security diff --git a/CMakeLists.txt b/CMakeLists.txt index 129bf1d87c25..b5c1068c1677 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,12 +37,23 @@ project( ) option(DPNP_GENERATE_COVERAGE "Enable build DPNP with coverage instrumentation" OFF) +option( + DPNP_TENSOR_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS + "Build dpnp tensor pybind11 offloading extensions with coverage instrumentation" + OFF +) option(DPNP_BACKEND_TESTS "Enable building of DPNP backend test suite" OFF) option( DPNP_WITH_REDIST "Build DPNP assuming DPC++ redistributable is installed into Python prefix" OFF ) +option( + DPNP_TENSOR_OFFLOAD_COMPRESS + "Build dpnp tensor using offload section compression feature of DPC++ to reduce \ +size of shared object with offloading sections" + OFF +) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -106,7 +117,6 @@ find_package(Cython REQUIRED) find_package(Dpctl REQUIRED) message(STATUS "Dpctl_INCLUDE_DIR=" ${Dpctl_INCLUDE_DIR}) -message(STATUS "Dpctl_TENSOR_INCLUDE_DIR=" ${Dpctl_TENSOR_INCLUDE_DIR}) option(DPNP_USE_ONEMATH "Build DPNP with oneMath" OFF) set(DPNP_TARGET_CUDA diff --git a/doc/conf.py b/doc/conf.py index 469e6d5f5353..57119eab5396 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -6,6 +6,7 @@ # http://www.sphinx-doc.org/en/master/config from datetime import datetime +from urllib.parse import urljoin from sphinx.ext.autodoc import FunctionDocumenter from sphinx.ext.napoleon import NumpyDocstring, docstring @@ -231,6 +232,9 @@ def _can_document_member(member, *args, **kwargs): autosummary_generate = True
+_DPCTL_021_BASE = "https://intelpython.github.io/dpctl/0.21.1/" +_DPCTL_021_INV = urljoin(_DPCTL_021_BASE, "objects.inv") + intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), "numpy": ("https://numpy.org/doc/stable/", None), @@ -302,3 +306,65 @@ def _parse_returns_section_patched(self, section: str) -> list[str]: NumpyDocstring._parse_returns_section = _parse_returns_section_patched + + +# TODO: Remove once dpnp.tensor docs are generated in dpnp +def _load_dpctl_tensor_inventory(app): + """Load dpctl 0.21.1 inventory for dpnp.tensor fallback only.""" + from sphinx.ext.intersphinx import fetch_inventory + from sphinx.util import logging + + logger = logging.getLogger(__name__) + + try: + inv = fetch_inventory(app, _DPCTL_021_BASE, _DPCTL_021_INV) + except Exception as exc: + logger.warning( + "Failed to load dpctl 0.21.1 inventory from %s: %s", + _DPCTL_021_INV, + exc, + ) + inv = {} + + app.builder.env._dpctl_tensor_021_inventory = inv + + +# TODO: Remove once dpnp.tensor docs are generated in dpnp +def _resolve_dpnp_tensor_refs(app, env, node, contnode): + """Resolve dpnp.tensor.* references to dpctl 0.21.1 documentation. + + This temporary workaround is needed because dpnp.tensor documentation + is not generated yet, while the corresponding API is still documented + in dpctl 0.21.1. + """ + from docutils import nodes as docutils_nodes + + target = node.get("reftarget", "") + if not target.startswith("dpnp.tensor"): + return None + + dpctl_target = target.replace("dpnp.tensor", "dpctl.tensor", 1) + dpctl_tensor_inv = getattr(env, "_dpctl_tensor_021_inventory", {}) + + for _objtype, objects in dpctl_tensor_inv.items(): + if dpctl_target not in objects: + continue + + item = objects[dpctl_target] + location = item.uri + if location.endswith("$"): + location = location[:-1] + dpctl_target + + refuri = urljoin(_DPCTL_021_BASE, location) + newnode = docutils_nodes.reference( + "", "", internal=False, refuri=refuri + ) + newnode += contnode.deepcopy() + return newnode + + return None + + +def setup(app): + app.connect("builder-inited", _load_dpctl_tensor_inventory, priority=400) + app.connect("missing-reference", _resolve_dpnp_tensor_refs, priority=400) diff --git a/doc/index.rst b/doc/index.rst index 38c12489636b..847680fc11d9 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -13,6 +13,7 @@ Data Parallel Extension for NumPy* overview quick_start_guide reference/index + tensor .. toctree:: :maxdepth: 1 diff --git a/doc/reference/exceptions.rst b/doc/reference/exceptions.rst index 8f459b9f3aaa..69980ac8d8c2 100644 --- a/doc/reference/exceptions.rst +++ b/doc/reference/exceptions.rst @@ -20,7 +20,7 @@ Exceptions .. exception:: DLPackCreationError Given when constructing DLPack capsule from either :class:`dpnp.ndarray` or - :class:`dpctl.tensor.usm_ndarray` based on a USM allocation + :class:`dpnp.tensor.usm_ndarray` based on a USM allocation on a partitioned SYCL device. .. rubric:: Examples diff --git a/doc/tensor.rst b/doc/tensor.rst new file mode 100644 index 000000000000..22a1812f38a3 --- /dev/null +++ b/doc/tensor.rst @@ -0,0 +1,70 @@ +.. _tensor: + +Tensor (``dpnp.tensor``) +======================== + +``dpnp.tensor`` provides a reference implementation of the +`Python Array API <https://data-apis.org/array-api/latest/>`_ specification. +The implementation uses data-parallel algorithms suitable for execution on +accelerators, such as GPUs. + +It also provides the underlying Array API-compliant implementation +used by ``dpnp``. 
+ +``dpnp.tensor`` is written in C++ and +`SYCL <https://www.khronos.org/sycl/>`_, +using oneAPI extensions implemented in the +`Intel(R) oneAPI DPC++ compiler <https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html>`_. + +Design and Motivation +--------------------- + +The tensor implementation was originally developed as a standalone project and +later integrated into the `dpctl <https://github.com/IntelPython/dpctl>`_ +library as ``dpctl.tensor``. It has since been migrated into ``dpnp``, +making ``dpnp`` the primary owner and development location of the tensor implementation. + +This change simplifies maintenance, reduces cross-project +dependencies, and enables independent development and release cycles. + +Relationship to ``dpnp.ndarray`` +-------------------------------- + +:class:`dpnp.ndarray` is a high-level array object built on top of +``dpnp.tensor.usm_ndarray``, storing array data in Unified Shared Memory +(USM) allocated on a SYCL device. Most users interact with +:class:`dpnp.ndarray` directly; ``dpnp.tensor.usm_ndarray`` may appear in error +messages or type signatures when working with device placement or +interoperability. + +Relationship to ``dpctl`` +------------------------- + +The migration of ``dpctl.tensor`` into ``dpnp.tensor`` does not replace +`dpctl <https://github.com/IntelPython/dpctl>`_ itself. +``dpctl`` remains responsible for device and queue management +(:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`) as well as USM memory +allocation. ``dpnp`` builds on top of these capabilities. + +Example +------- + +.. code-block:: python + + import dpnp + import dpnp.tensor as dpt + + # Create a tensor array on the default device + x = dpt.asarray([1.0, 2.0, 3.0]) + + # dpnp.ndarray wraps the underlying usm_ndarray + a = dpnp.asarray([1.0, 2.0, 3.0]) + assert isinstance(a.get_array(), dpt.usm_ndarray) + +.. note:: + + The ``dpnp.tensor`` API documentation will be added in a future release. + + The current implementation remains compatible with the original + ``dpctl.tensor`` API. For the complete API reference, see the + `dpctl 0.21.1 tensor documentation <https://intelpython.github.io/dpctl/0.21.1/>`_. 
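Because ``dpctl`` continues to own device and queue management, a :class:`dpctl.SyclQueue` is expected to work with ``dpnp.tensor`` creation functions just as it did with ``dpctl.tensor``. The snippet below is a minimal sketch of that interplay; it assumes the ``usm_type`` and ``sycl_queue`` keywords keep the behavior documented for ``dpctl.tensor``.

.. code-block:: python

    import dpctl
    import dpnp.tensor as dpt

    # dpctl still owns device and queue management
    q = dpctl.SyclQueue()  # queue on the default-selected device

    # allocate the tensor as shared USM bound to that queue
    x = dpt.zeros((3, 3), usm_type="shared", sycl_queue=q)

    assert x.usm_type == "shared"
    print(x.device)  # the SYCL device the tensor lives on
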
diff --git a/dpnp/CMakeLists.txt b/dpnp/CMakeLists.txt index 6850b799735c..d7acf368bcd0 100644 --- a/dpnp/CMakeLists.txt +++ b/dpnp/CMakeLists.txt @@ -86,11 +86,96 @@ function(build_dpnp_cython_ext _trgt _src _dest) install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) endfunction() +function(build_dpnp_tensor_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPNP_TENSOR "${options}" "RELATIVE_PATH" "" ${ARGN}) + add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) + set(_cythonize_trgt "${_trgt}_cythonize_pyx") + python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if(BUILD_DPNP_TENSOR_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) + if(DPNP_TENSOR_OFFLOAD_COMPRESS) + target_link_options(${_trgt} PRIVATE --offload-compress) + endif() + if(_dpnp_sycl_targets) + # make fat binary + target_compile_options( + ${_trgt} + PRIVATE ${_dpnp_sycl_target_compile_options} + ) + target_link_options(${_trgt} PRIVATE ${_dpnp_sycl_target_link_options}) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE Python::NumPy) + if(DPNP_GENERATE_COVERAGE) + target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) + if(BUILD_DPNP_TENSOR_SYCL) + target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) + endif() + endif() + # Dpctl + target_include_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}) + target_link_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}/..) + target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) + set(_linker_options "LINKER:${DPNP_LDFLAGS}") + target_link_options(${_trgt} PRIVATE ${_linker_options}) + get_filename_component(_name_wle ${_generated_src} NAME_WLE) + get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) + set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") + set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") + + # TODO: create separate folder inside build folder that contains only + # headers related to this target and appropriate folder structure to + # eliminate shadow dependencies + # Go up two levels to build root for "dpnp/tensor/_usmarray.h" resolution + get_filename_component(_parent_dir ${_generated_src_dir} DIRECTORY) + get_filename_component(_build_root ${_parent_dir} DIRECTORY) + # TODO: do not set directory if we did not generate header + target_include_directories(${_trgt} INTERFACE ${_build_root}) + set(_rpath_value "$ORIGIN") + if(BUILD_DPNP_TENSOR_RELATIVE_PATH) + set(_rpath_value "${_rpath_value}/${BUILD_DPNP_TENSOR_RELATIVE_PATH}") + endif() + if(DPNP_WITH_REDIST) + set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") + endif() + set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) + + install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) + install( + FILES ${_generated_api_h} + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpnp/include/${_dest} + OPTIONAL + ) + install( + FILES ${_generated_public_h} + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpnp/include/${_dest} + OPTIONAL + ) + if(DPNP_GENERATE_COVERAGE) + get_filename_component(_original_src_dir ${_src} DIRECTORY) + file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) + install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) + endif() + + # Create target with headers only, because python is managing all the + # library imports at runtime + 
set(_trgt_headers ${_trgt}_headers) + add_library(${_trgt_headers} INTERFACE) + add_dependencies(${_trgt_headers} ${_trgt}) + get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) +endfunction() + function(build_dpnp_cython_ext_with_backend _trgt _src _dest) build_dpnp_cython_ext(${_trgt} ${_src} ${_dest}) target_link_libraries(${_trgt} PRIVATE dpnp_backend_library) endfunction() +add_subdirectory(tensor) + add_subdirectory(backend) add_subdirectory(backend/extensions/blas) add_subdirectory(backend/extensions/fft) diff --git a/dpnp/__init__.py b/dpnp/__init__.py index 02420107972f..d2ea158d4d44 100644 --- a/dpnp/__init__.py +++ b/dpnp/__init__.py @@ -28,7 +28,6 @@ import os import sys -import warnings mypath = os.path.dirname(os.path.realpath(__file__)) @@ -61,10 +60,7 @@ [os.getenv("PATH", ""), dll_path] ) -# Borrowed from DPCTL -with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - from dpctl.tensor import __array_api_version__, DLDeviceType +from .tensor import __array_api_version__, DLDeviceType from .dpnp_array import dpnp_array as ndarray from .dpnp_array_api_info import __array_namespace_info__ diff --git a/dpnp/__main__.py b/dpnp/__main__.py new file mode 100644 index 000000000000..1c9c652109ee --- /dev/null +++ b/dpnp/__main__.py @@ -0,0 +1,78 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import argparse +import importlib.util +import os +import os.path +import sys + + +def _dpnp_dir() -> str: + dpnp_dir = importlib.util.find_spec("dpnp").submodule_search_locations[0] + abs_dpnp_dir = os.path.abspath(dpnp_dir) + return abs_dpnp_dir + + +def get_tensor_include_dir() -> str: + """Returns the path to the dpnp libtensor include directory""" + dpnp_dir = _dpnp_dir() + libtensor_dir = os.path.join(dpnp_dir, "tensor", "libtensor", "include") + return libtensor_dir + + +def print_tensor_include_flags() -> None: + """Prints include flags for dpnp tensor library""" + libtensor_dir = get_tensor_include_dir() + print("-I " + libtensor_dir) + + +def main() -> None: + """Main entry-point.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--tensor-includes", + action="store_true", + help="Include flags for dpnp libtensor headers.", + ) + parser.add_argument( + "--tensor-include-dir", + action="store_true", + help="Path to dpnp libtensor include directory.", + ) + args = parser.parse_args() + if not sys.argv[1:]: + parser.print_help() + if args.tensor_includes: + print_tensor_include_flags() + if args.tensor_include_dir: + print(get_tensor_include_dir()) + + +if __name__ == "__main__": + main() diff --git a/dpnp/backend/CMakeLists.txt b/dpnp/backend/CMakeLists.txt index ddca557a08f4..433ab298d476 100644 --- a/dpnp/backend/CMakeLists.txt +++ b/dpnp/backend/CMakeLists.txt @@ -89,7 +89,6 @@ target_compile_definitions(${_trgt} PUBLIC PSTL_USE_PARALLEL_POLICIES=0) target_compile_definitions(${_trgt} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0) target_include_directories(${_trgt} PUBLIC ${Dpctl_INCLUDE_DIR}) -target_include_directories(${_trgt} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR}) target_link_directories(${_trgt} PUBLIC "${Dpctl_INCLUDE_DIR}/..") target_link_libraries(${_trgt} PUBLIC DPCTLSyclInterface) diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 5960dfcd8028..b4013d82eb40 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -39,6 +39,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -65,14 +68,20 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/blas/dot_common.hpp b/dpnp/backend/extensions/blas/dot_common.hpp index 383804ff1718..d9c3ae7f1c87 100644 --- a/dpnp/backend/extensions/blas/dot_common.hpp +++ b/dpnp/backend/extensions/blas/dot_common.hpp @@ -29,6 +29,7 @@ #pragma once #include + #include // dpctl tensor headers diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp index 
997d515f98a0..59a3d911d885 100644 --- a/dpnp/backend/extensions/blas/gemm.hpp +++ b/dpnp/backend/extensions/blas/gemm.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/blas/gemv.hpp b/dpnp/backend/extensions/blas/gemv.hpp index afe0c6387aa9..6da71ed0964f 100644 --- a/dpnp/backend/extensions/blas/gemv.hpp +++ b/dpnp/backend/extensions/blas/gemv.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/blas/syrk.hpp b/dpnp/backend/extensions/blas/syrk.hpp index 580239b28008..f6cec189489a 100644 --- a/dpnp/backend/extensions/blas/syrk.hpp +++ b/dpnp/backend/extensions/blas/syrk.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::blas { diff --git a/dpnp/backend/extensions/common/ext/common.hpp b/dpnp/backend/extensions/common/ext/common.hpp index f0ce1722bfb1..3c82fb10ec16 100644 --- a/dpnp/backend/extensions/common/ext/common.hpp +++ b/dpnp/backend/extensions/common/ext/common.hpp @@ -29,8 +29,10 @@ #pragma once #include + #include #include + #include // dpctl tensor headers diff --git a/dpnp/backend/extensions/common/ext/details/common_internal.hpp b/dpnp/backend/extensions/common/ext/details/common_internal.hpp index 31d9671a0a43..8db72ce32318 100644 --- a/dpnp/backend/extensions/common/ext/details/common_internal.hpp +++ b/dpnp/backend/extensions/common/ext/details/common_internal.hpp @@ -30,9 +30,11 @@ #include +#include +#include + #include "ext/common.hpp" #include "utils/type_dispatch.hpp" -#include namespace dpctl_td_ns = dpctl::tensor::type_dispatch; diff --git a/dpnp/backend/extensions/common/ext/validation_utils.hpp b/dpnp/backend/extensions/common/ext/validation_utils.hpp index d41db8d5ca5a..03e0718d4450 100644 --- a/dpnp/backend/extensions/common/ext/validation_utils.hpp +++ b/dpnp/backend/extensions/common/ext/validation_utils.hpp @@ -32,7 +32,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" namespace ext::validation { diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp index 6a29c9a33c5a..affe2fb5dc49 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions.hpp @@ -30,16 +30,17 @@ #include #include +#include #include #include #include -#include - -#include "dpctl4pybind11.hpp" #include #include -#include + +#include + +#include "dpnp4pybind11.hpp" #include "elementwise_functions_type_utils.hpp" #include "simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp index 62f7584a3e0c..7300f938eabb 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.cpp @@ -26,12 +26,13 @@ // THE POSSIBILITY OF SUCH DAMAGE. 
//***************************************************************************** -#include "dpctl4pybind11.hpp" - #include #include + #include +#include "dpnp4pybind11.hpp" + #include "elementwise_functions_type_utils.hpp" // dpctl tensor headers diff --git a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp index 1bb6fedd7027..58fe43c01589 100644 --- a/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp +++ b/dpnp/backend/extensions/elementwise_functions/elementwise_functions_type_utils.hpp @@ -28,10 +28,10 @@ #pragma once -#include "dpctl4pybind11.hpp" #include #include -#include + +#include "dpnp4pybind11.hpp" // dpctl tensor headers #include "utils/type_dispatch.hpp" diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index f8f63dd7fd3b..9c452d94bd23 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -33,6 +33,9 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/fft_py.cpp) pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -57,11 +60,21 @@ set_target_properties( PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON ) +target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include +) + # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/fft/in_place.hpp b/dpnp/backend/extensions/fft/in_place.hpp index 7eed11565b9e..bc35201b9b6e 100644 --- a/dpnp/backend/extensions/fft/in_place.hpp +++ b/dpnp/backend/extensions/fft/in_place.hpp @@ -28,10 +28,13 @@ #pragma once +#include +#include + #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::fft { diff --git a/dpnp/backend/extensions/fft/in_place.tpp b/dpnp/backend/extensions/fft/in_place.tpp index 4bc166b0e7ae..ace535284de6 100644 --- a/dpnp/backend/extensions/fft/in_place.tpp +++ b/dpnp/backend/extensions/fft/in_place.tpp @@ -27,15 +27,23 @@ //***************************************************************************** #pragma once + #include +#include +#include +#include + +#include #include #include -#include +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fft_utils.hpp" +#include "in_place.hpp" + // dpctl tensor headers #include "utils/output_validation.hpp" diff --git a/dpnp/backend/extensions/fft/out_of_place.hpp b/dpnp/backend/extensions/fft/out_of_place.hpp index 811a2bd6d1c4..55ca9383baaf 100644 --- a/dpnp/backend/extensions/fft/out_of_place.hpp +++ b/dpnp/backend/extensions/fft/out_of_place.hpp @@ -28,10 +28,13 @@ #pragma once +#include +#include + #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::fft { diff --git a/dpnp/backend/extensions/fft/out_of_place.tpp b/dpnp/backend/extensions/fft/out_of_place.tpp index ed5cd37df7f1..aada49c16bda 
100644 --- a/dpnp/backend/extensions/fft/out_of_place.tpp +++ b/dpnp/backend/extensions/fft/out_of_place.tpp @@ -27,15 +27,25 @@ //***************************************************************************** #pragma once + +#include +#include #include +#include +#include +#include #include #include -#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fft_utils.hpp" +#include "out_of_place.hpp" + // dpctl tensor headers #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index e1bc34c9ae8b..ce800a87124c 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -36,6 +36,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -62,14 +65,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/indexing/choose.cpp b/dpnp/backend/extensions/indexing/choose.cpp index 3b2df73f46ef..fafcbe1f2495 100644 --- a/dpnp/backend/extensions/indexing/choose.cpp +++ b/dpnp/backend/extensions/indexing/choose.cpp @@ -39,10 +39,11 @@ #include -#include "dpctl4pybind11.hpp" #include #include +#include "dpnp4pybind11.hpp" + #include "ext/common.hpp" #include "kernels/indexing/choose.hpp" diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 6dee8abebeca..6c898df05236 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -56,6 +56,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -82,14 +85,20 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp 
b/dpnp/backend/extensions/lapack/geqrf.hpp index 522006ace8ab..7be1fee971cf 100644 --- a/dpnp/backend/extensions/lapack/geqrf.hpp +++ b/dpnp/backend/extensions/lapack/geqrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/gesv.hpp b/dpnp/backend/extensions/lapack/gesv.hpp index d4198efae62e..a86039c9b72e 100644 --- a/dpnp/backend/extensions/lapack/gesv.hpp +++ b/dpnp/backend/extensions/lapack/gesv.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/gesvd.hpp b/dpnp/backend/extensions/lapack/gesvd.hpp index 116348e01d9f..b2fea5e47299 100644 --- a/dpnp/backend/extensions/lapack/gesvd.hpp +++ b/dpnp/backend/extensions/lapack/gesvd.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getrf.hpp b/dpnp/backend/extensions/lapack/getrf.hpp index 24ec473f4dc7..ce6dc3e788b5 100644 --- a/dpnp/backend/extensions/lapack/getrf.hpp +++ b/dpnp/backend/extensions/lapack/getrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getri.hpp b/dpnp/backend/extensions/lapack/getri.hpp index d8c8e58f3fcb..728af4a77e01 100644 --- a/dpnp/backend/extensions/lapack/getri.hpp +++ b/dpnp/backend/extensions/lapack/getri.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/getrs.hpp b/dpnp/backend/extensions/lapack/getrs.hpp index f5a47c69c9ec..2728b0c4e04a 100644 --- a/dpnp/backend/extensions/lapack/getrs.hpp +++ b/dpnp/backend/extensions/lapack/getrs.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp index 96d6a03e9b8e..ecad85f468ef 100644 --- a/dpnp/backend/extensions/lapack/heevd.cpp +++ b/dpnp/backend/extensions/lapack/heevd.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "evd_common.hpp" diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp index e8614498bd41..54521136127a 100644 --- a/dpnp/backend/extensions/lapack/heevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "common_helpers.hpp" diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp index 962edc7b668f..2502fe567a1f 100644 --- a/dpnp/backend/extensions/lapack/orgqr.hpp +++ b/dpnp/backend/extensions/lapack/orgqr.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/potrf.hpp b/dpnp/backend/extensions/lapack/potrf.hpp index d5df48a9ddf4..02faf2c04fde 100644 --- a/dpnp/backend/extensions/lapack/potrf.hpp +++ b/dpnp/backend/extensions/lapack/potrf.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp index 3ecd386299ac..60dae80e90c6 100644 --- a/dpnp/backend/extensions/lapack/syevd.cpp +++ 
b/dpnp/backend/extensions/lapack/syevd.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "evd_common.hpp" diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp index 13237d27a35c..884b6045f418 100644 --- a/dpnp/backend/extensions/lapack/syevd_batch.cpp +++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp @@ -28,6 +28,7 @@ #include +#include #include #include "common_helpers.hpp" diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp index a149af1e24e1..8c9a36b3f4a6 100644 --- a/dpnp/backend/extensions/lapack/ungqr.hpp +++ b/dpnp/backend/extensions/lapack/ungqr.hpp @@ -31,7 +31,7 @@ #include #include -#include +#include "dpnp4pybind11.hpp" namespace dpnp::extensions::lapack { diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index 36786c8cbaf3..434d223de3ab 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -41,6 +41,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -67,14 +70,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/statistics/bincount.hpp b/dpnp/backend/extensions/statistics/bincount.hpp index 5e42952349b0..2fc477e71edc 100644 --- a/dpnp/backend/extensions/statistics/bincount.hpp +++ b/dpnp/backend/extensions/statistics/bincount.hpp @@ -31,7 +31,8 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + #include "ext/dispatch_table.hpp" namespace dpctl_td_ns = dpctl::tensor::type_dispatch; diff --git a/dpnp/backend/extensions/statistics/histogram.cpp b/dpnp/backend/extensions/statistics/histogram.cpp index 6d7da6836f60..afc5d9638f48 100644 --- a/dpnp/backend/extensions/statistics/histogram.cpp +++ b/dpnp/backend/extensions/statistics/histogram.cpp @@ -35,8 +35,9 @@ #include #include +#include "dpnp4pybind11.hpp" + // dpctl tensor headers -#include "dpctl4pybind11.hpp" #include "utils/type_dispatch.hpp" #include "histogram.hpp" @@ -50,7 +51,6 @@ using namespace ext::common; namespace { - template struct HistogramEdges { diff --git a/dpnp/backend/extensions/statistics/histogram.hpp b/dpnp/backend/extensions/statistics/histogram.hpp index c6a79ec24ee3..d04d8edbf02b 100644 --- a/dpnp/backend/extensions/statistics/histogram.hpp +++ b/dpnp/backend/extensions/statistics/histogram.hpp @@ -31,7 +31,9 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/dispatch_table.hpp" namespace statistics::histogram diff --git 
a/dpnp/backend/extensions/statistics/histogram_common.cpp b/dpnp/backend/extensions/statistics/histogram_common.cpp index 82afa2bd965d..252e1cd7c7cc 100644 --- a/dpnp/backend/extensions/statistics/histogram_common.cpp +++ b/dpnp/backend/extensions/statistics/histogram_common.cpp @@ -31,15 +31,18 @@ #include #include -#include "dpctl4pybind11.hpp" -#include "utils/type_dispatch.hpp" - #include +#include "dpnp4pybind11.hpp" + #include "histogram_common.hpp" +// utils extension header #include "ext/validation_utils.hpp" +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + namespace dpctl_td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::usm_ndarray; using dpctl_td_ns::typenum_t; @@ -57,7 +60,6 @@ using ext::validation::name_of; namespace statistics::histogram { - void validate(const usm_ndarray &sample, const std::optional &bins, const std::optional &weights, diff --git a/dpnp/backend/extensions/statistics/histogram_common.hpp b/dpnp/backend/extensions/statistics/histogram_common.hpp index 8091e8874d17..47fef11061f3 100644 --- a/dpnp/backend/extensions/statistics/histogram_common.hpp +++ b/dpnp/backend/extensions/statistics/histogram_common.hpp @@ -35,7 +35,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "ext/common.hpp" #include "kernels/statistics/histogram.hpp" diff --git a/dpnp/backend/extensions/statistics/histogramdd.hpp b/dpnp/backend/extensions/statistics/histogramdd.hpp index 327e9941dbc6..d7c46ae34b7d 100644 --- a/dpnp/backend/extensions/statistics/histogramdd.hpp +++ b/dpnp/backend/extensions/statistics/histogramdd.hpp @@ -31,7 +31,9 @@ #include #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/dispatch_table.hpp" namespace statistics::histogram diff --git a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp index b8f679f1030e..6c0e39a11a19 100644 --- a/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp +++ b/dpnp/backend/extensions/statistics/sliding_dot_product1d.cpp @@ -33,11 +33,14 @@ #include #include +#include "dpnp4pybind11.hpp" + +// utils extension header +#include "ext/common.hpp" + // dpctl tensor headers -#include "dpctl4pybind11.hpp" #include "utils/type_dispatch.hpp" -#include "ext/common.hpp" #include "sliding_dot_product1d.hpp" #include "sliding_window1d.hpp" @@ -51,7 +54,6 @@ using namespace ext::common; namespace { - template struct SlidingDotProductF { diff --git a/dpnp/backend/extensions/statistics/sliding_window1d.cpp b/dpnp/backend/extensions/statistics/sliding_window1d.cpp index 3ae66daa332b..81f8ae40104e 100644 --- a/dpnp/backend/extensions/statistics/sliding_window1d.cpp +++ b/dpnp/backend/extensions/statistics/sliding_window1d.cpp @@ -29,11 +29,16 @@ #include #include -#include "dpctl4pybind11.hpp" -#include "utils/type_dispatch.hpp" #include +#include "dpnp4pybind11.hpp" + +// utils extension header #include "ext/validation_utils.hpp" + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + #include "sliding_window1d.hpp" namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -48,7 +53,6 @@ using ext::validation::name_of; namespace statistics::sliding_window1d { - void validate(const usm_ndarray &a, const usm_ndarray &v, const usm_ndarray &out, @@ -89,5 +93,4 @@ void validate(const usm_ndarray &a, std::to_string(expected_output_size) + ")"); } } - } // namespace statistics::sliding_window1d diff --git 
a/dpnp/backend/extensions/statistics/sliding_window1d.hpp b/dpnp/backend/extensions/statistics/sliding_window1d.hpp index 329c96dfc1c6..a13c1f873e78 100644 --- a/dpnp/backend/extensions/statistics/sliding_window1d.hpp +++ b/dpnp/backend/extensions/statistics/sliding_window1d.hpp @@ -34,7 +34,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/statistics/sliding_window1d.hpp" diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index ae6015e11d0f..2b01823d01f3 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -67,6 +67,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause @@ -84,14 +87,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(_dpnp_sycl_targets) diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp index a0842f4ef259..761bd330a326 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/bitwise_count.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "bitwise_count.hpp" #include "kernels/elementwise_functions/bitwise_count.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp index 77452a6b777f..729fcb576c77 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/degrees.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "degrees.hpp" #include "kernels/elementwise_functions/degrees.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp index af87dcc85f53..1bb3859a39f4 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/divmod.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "divmod.hpp" #include "kernels/elementwise_functions/divmod.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp 
b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp index 6f10e651fe25..c07989939b70 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/erf_funcs.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "erf_funcs.hpp" #include "kernels/elementwise_functions/erf.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp index d2b6ae24ac4b..f7c2183633af 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fabs.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fabs.hpp" #include "kernels/elementwise_functions/fabs.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp index 0994afc7c738..43927eb93806 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/float_power.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "float_power.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp index 5e1a9f33444b..9471feaf2166 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmax.cpp @@ -28,9 +28,13 @@ #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmax.hpp" #include "kernels/elementwise_functions/fmax.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp index c0e1db654317..8e279897f414 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmin.cpp @@ -28,9 +28,13 @@ #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmin.hpp" #include "kernels/elementwise_functions/fmin.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp index 5b83595b3f7c..83fb750b6907 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/fmod.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "fmod.hpp" #include "kernels/elementwise_functions/fmod.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp index 4439f1e76993..17e09f3ee816 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/frexp.cpp @@ -31,9 +31,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "frexp.hpp" #include 
"kernels/elementwise_functions/frexp.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp index ec10504fa15e..0481365356ca 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/gcd.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "gcd.hpp" #include "kernels/elementwise_functions/gcd.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp index e3212de86f7f..62affd206420 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/heaviside.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "heaviside.hpp" #include "kernels/elementwise_functions/heaviside.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp index 4d120a56e837..53ded341b58b 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/i0.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "i0.hpp" #include "kernels/elementwise_functions/i0.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp index 8830569ce9cf..36dae50e7b2c 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/interpolate.cpp @@ -35,12 +35,14 @@ #include #include -#include - -#include "dpctl4pybind11.hpp" +#include #include #include +#include + +#include "dpnp4pybind11.hpp" + #include "kernels/elementwise_functions/interpolate.hpp" // dpctl tensor headers diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp index b8179feb9263..3025cbf16586 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/isclose.cpp @@ -32,12 +32,14 @@ #include #include -#include - -#include "dpctl4pybind11.hpp" +#include #include #include +#include + +#include "dpnp4pybind11.hpp" + #include "kernels/elementwise_functions/isclose.hpp" #include "../../elementwise_functions/simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp index 4276ceb6b246..35138e903eac 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/lcm.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/lcm.hpp" #include "lcm.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp index 3e2c4f3d0149..44ef51726a6a 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp +++ 
b/dpnp/backend/extensions/ufunc/elementwise_functions/ldexp.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/ldexp.hpp" #include "ldexp.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp index 57c7c60ca9cf..e37f13b119d6 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/logaddexp2.cpp @@ -28,9 +28,13 @@ #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/logaddexp2.hpp" #include "logaddexp2.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp index f8aab23d5630..266103248521 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/modf.cpp @@ -31,9 +31,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/modf.hpp" #include "modf.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp index 2490f1921a98..c30d388f8afd 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/nan_to_num.cpp @@ -38,11 +38,12 @@ #include -#include "dpctl4pybind11.hpp" #include #include #include +#include "dpnp4pybind11.hpp" + #include "kernels/elementwise_functions/nan_to_num.hpp" #include "../../elementwise_functions/simplify_iteration_space.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp index 7fc8ae5331dd..0a481fd33d11 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/radians.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/radians.hpp" #include "populate.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp index abd02e1e6282..87a911472db2 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/sinc.cpp @@ -30,9 +30,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/sinc.hpp" #include "populate.hpp" diff --git a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp index 6e401c5388dd..4c14582f30ae 100644 --- a/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp +++ b/dpnp/backend/extensions/ufunc/elementwise_functions/spacing.cpp @@ -29,9 +29,13 @@ #include #include +#include +#include +#include + #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include "kernels/elementwise_functions/spacing.hpp" #include "populate.hpp" diff --git 
a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 7165f7b926fb..05aa64e0d814 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -90,6 +90,9 @@ set(python_module_name _vm_impl) pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause @@ -107,14 +110,20 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/vm/abs.cpp b/dpnp/backend/extensions/vm/abs.cpp index 133f3077ac43..1dc8143dd5ff 100644 --- a/dpnp/backend/extensions/vm/abs.cpp +++ b/dpnp/backend/extensions/vm/abs.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "abs.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/acos.cpp b/dpnp/backend/extensions/vm/acos.cpp index 0cb9bb32f4b8..15b4ce80cc3c 100644 --- a/dpnp/backend/extensions/vm/acos.cpp +++ b/dpnp/backend/extensions/vm/acos.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "acos.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/acosh.cpp b/dpnp/backend/extensions/vm/acosh.cpp index fa25ecf5cc1e..eed835b78e10 100644 --- a/dpnp/backend/extensions/vm/acosh.cpp +++ b/dpnp/backend/extensions/vm/acosh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "acosh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/add.cpp b/dpnp/backend/extensions/vm/add.cpp index 165671c93415..a58aac727cd1 100644 --- a/dpnp/backend/extensions/vm/add.cpp +++ b/dpnp/backend/extensions/vm/add.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "add.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/arg.cpp b/dpnp/backend/extensions/vm/arg.cpp index e062f1f2ee06..c50c4a33dee1 100644 --- a/dpnp/backend/extensions/vm/arg.cpp +++ b/dpnp/backend/extensions/vm/arg.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "arg.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/asin.cpp b/dpnp/backend/extensions/vm/asin.cpp index 8a2e1c079ed8..5af7033fed21 100644 --- a/dpnp/backend/extensions/vm/asin.cpp +++ b/dpnp/backend/extensions/vm/asin.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "asin.hpp" #include "common.hpp" diff --git 
a/dpnp/backend/extensions/vm/asinh.cpp b/dpnp/backend/extensions/vm/asinh.cpp index 176bacdb92a8..5b0f8ed13106 100644 --- a/dpnp/backend/extensions/vm/asinh.cpp +++ b/dpnp/backend/extensions/vm/asinh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "asinh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atan.cpp b/dpnp/backend/extensions/vm/atan.cpp index 21c8c8f1c9d5..2255000c1c4b 100644 --- a/dpnp/backend/extensions/vm/atan.cpp +++ b/dpnp/backend/extensions/vm/atan.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "atan.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atan2.cpp b/dpnp/backend/extensions/vm/atan2.cpp index 1d4e5c333e68..bf29e2921a1d 100644 --- a/dpnp/backend/extensions/vm/atan2.cpp +++ b/dpnp/backend/extensions/vm/atan2.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "atan2.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/atanh.cpp b/dpnp/backend/extensions/vm/atanh.cpp index 7097fabf602f..9daab09980e6 100644 --- a/dpnp/backend/extensions/vm/atanh.cpp +++ b/dpnp/backend/extensions/vm/atanh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "atanh.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/cbrt.cpp b/dpnp/backend/extensions/vm/cbrt.cpp index db3cdfcebd8d..34ff8dd913ac 100644 --- a/dpnp/backend/extensions/vm/cbrt.cpp +++ b/dpnp/backend/extensions/vm/cbrt.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "cbrt.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/ceil.cpp b/dpnp/backend/extensions/vm/ceil.cpp index 6f5aeba16f99..e76a30d28317 100644 --- a/dpnp/backend/extensions/vm/ceil.cpp +++ b/dpnp/backend/extensions/vm/ceil.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "ceil.hpp" #include "common.hpp" diff --git a/dpnp/backend/extensions/vm/common.hpp b/dpnp/backend/extensions/vm/common.hpp index 325aba7fafd2..5d2631d5b556 100644 --- a/dpnp/backend/extensions/vm/common.hpp +++ b/dpnp/backend/extensions/vm/common.hpp @@ -34,10 +34,10 @@ #include #include +#include #include -#include -#include +#include "dpnp4pybind11.hpp" // utils extension header #include "ext/common.hpp" diff --git a/dpnp/backend/extensions/vm/conj.cpp b/dpnp/backend/extensions/vm/conj.cpp index 36710104750a..f77020cf1d55 100644 --- a/dpnp/backend/extensions/vm/conj.cpp +++ b/dpnp/backend/extensions/vm/conj.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "conj.hpp" diff --git a/dpnp/backend/extensions/vm/copysign.cpp b/dpnp/backend/extensions/vm/copysign.cpp index cd90abf65a06..15c0fceec413 100644 --- a/dpnp/backend/extensions/vm/copysign.cpp +++ b/dpnp/backend/extensions/vm/copysign.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "copysign.hpp" diff --git a/dpnp/backend/extensions/vm/cos.cpp b/dpnp/backend/extensions/vm/cos.cpp index 76db72594763..7c9b0c35d6ca 100644 --- a/dpnp/backend/extensions/vm/cos.cpp 
+++ b/dpnp/backend/extensions/vm/cos.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "cos.hpp" diff --git a/dpnp/backend/extensions/vm/cosh.cpp b/dpnp/backend/extensions/vm/cosh.cpp index 464410b1accc..a95c7075ba61 100644 --- a/dpnp/backend/extensions/vm/cosh.cpp +++ b/dpnp/backend/extensions/vm/cosh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "cosh.hpp" diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp index ad96f9acf083..6e0cb4d0439f 100644 --- a/dpnp/backend/extensions/vm/div.cpp +++ b/dpnp/backend/extensions/vm/div.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "div.hpp" diff --git a/dpnp/backend/extensions/vm/erf_funcs.cpp b/dpnp/backend/extensions/vm/erf_funcs.cpp index 4e84403eb061..7be7f691edcf 100644 --- a/dpnp/backend/extensions/vm/erf_funcs.cpp +++ b/dpnp/backend/extensions/vm/erf_funcs.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "erf_funcs.hpp" diff --git a/dpnp/backend/extensions/vm/exp.cpp b/dpnp/backend/extensions/vm/exp.cpp index acd265d191f7..31f50f36171d 100644 --- a/dpnp/backend/extensions/vm/exp.cpp +++ b/dpnp/backend/extensions/vm/exp.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "exp.hpp" diff --git a/dpnp/backend/extensions/vm/exp2.cpp b/dpnp/backend/extensions/vm/exp2.cpp index 82c6c32fb6c5..41f18351fa7d 100644 --- a/dpnp/backend/extensions/vm/exp2.cpp +++ b/dpnp/backend/extensions/vm/exp2.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "exp2.hpp" diff --git a/dpnp/backend/extensions/vm/expm1.cpp b/dpnp/backend/extensions/vm/expm1.cpp index 93cef7b3272d..37440cab9b0c 100644 --- a/dpnp/backend/extensions/vm/expm1.cpp +++ b/dpnp/backend/extensions/vm/expm1.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "expm1.hpp" diff --git a/dpnp/backend/extensions/vm/floor.cpp b/dpnp/backend/extensions/vm/floor.cpp index fb1a86eda7bf..771d141e7f6a 100644 --- a/dpnp/backend/extensions/vm/floor.cpp +++ b/dpnp/backend/extensions/vm/floor.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "floor.hpp" diff --git a/dpnp/backend/extensions/vm/fmax.cpp b/dpnp/backend/extensions/vm/fmax.cpp index 32786a3e8fc2..d01b3ef3dc42 100644 --- a/dpnp/backend/extensions/vm/fmax.cpp +++ b/dpnp/backend/extensions/vm/fmax.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fmax.hpp" diff --git a/dpnp/backend/extensions/vm/fmin.cpp b/dpnp/backend/extensions/vm/fmin.cpp index d923b8c7ddfb..6fbebba556f8 100644 --- a/dpnp/backend/extensions/vm/fmin.cpp +++ b/dpnp/backend/extensions/vm/fmin.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include 
"common.hpp" #include "fmin.hpp" diff --git a/dpnp/backend/extensions/vm/fmod.cpp b/dpnp/backend/extensions/vm/fmod.cpp index 6c8a4ac705e4..1330453d6f84 100644 --- a/dpnp/backend/extensions/vm/fmod.cpp +++ b/dpnp/backend/extensions/vm/fmod.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "fmod.hpp" diff --git a/dpnp/backend/extensions/vm/hypot.cpp b/dpnp/backend/extensions/vm/hypot.cpp index 92b7c78f8ad6..a9b3d3c12288 100644 --- a/dpnp/backend/extensions/vm/hypot.cpp +++ b/dpnp/backend/extensions/vm/hypot.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "hypot.hpp" diff --git a/dpnp/backend/extensions/vm/i0.cpp b/dpnp/backend/extensions/vm/i0.cpp index 5db3ef9d9669..50f692ebd958 100644 --- a/dpnp/backend/extensions/vm/i0.cpp +++ b/dpnp/backend/extensions/vm/i0.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "i0.hpp" diff --git a/dpnp/backend/extensions/vm/inv.cpp b/dpnp/backend/extensions/vm/inv.cpp index 1adeb1be23d0..eda08a6d0cd5 100644 --- a/dpnp/backend/extensions/vm/inv.cpp +++ b/dpnp/backend/extensions/vm/inv.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "inv.hpp" diff --git a/dpnp/backend/extensions/vm/ln.cpp b/dpnp/backend/extensions/vm/ln.cpp index e60a0545005b..a5365e4d5a8b 100644 --- a/dpnp/backend/extensions/vm/ln.cpp +++ b/dpnp/backend/extensions/vm/ln.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "ln.hpp" diff --git a/dpnp/backend/extensions/vm/log10.cpp b/dpnp/backend/extensions/vm/log10.cpp index d26ec57ab9ce..c04fb602f63d 100644 --- a/dpnp/backend/extensions/vm/log10.cpp +++ b/dpnp/backend/extensions/vm/log10.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log10.hpp" diff --git a/dpnp/backend/extensions/vm/log1p.cpp b/dpnp/backend/extensions/vm/log1p.cpp index 861804f8f6e0..04416bf37185 100644 --- a/dpnp/backend/extensions/vm/log1p.cpp +++ b/dpnp/backend/extensions/vm/log1p.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log1p.hpp" diff --git a/dpnp/backend/extensions/vm/log2.cpp b/dpnp/backend/extensions/vm/log2.cpp index e75e96c32fe9..752caa261977 100644 --- a/dpnp/backend/extensions/vm/log2.cpp +++ b/dpnp/backend/extensions/vm/log2.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "log2.hpp" diff --git a/dpnp/backend/extensions/vm/modf.cpp b/dpnp/backend/extensions/vm/modf.cpp index ef68c79d8b42..418e4e44f7f7 100644 --- a/dpnp/backend/extensions/vm/modf.cpp +++ b/dpnp/backend/extensions/vm/modf.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "modf.hpp" diff --git a/dpnp/backend/extensions/vm/mul.cpp b/dpnp/backend/extensions/vm/mul.cpp index 0c9cf7fb79cc..557cfb8882b3 100644 --- a/dpnp/backend/extensions/vm/mul.cpp +++ 
b/dpnp/backend/extensions/vm/mul.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "mul.hpp" diff --git a/dpnp/backend/extensions/vm/nextafter.cpp b/dpnp/backend/extensions/vm/nextafter.cpp index 59b205b3d62a..a8ff710bda77 100644 --- a/dpnp/backend/extensions/vm/nextafter.cpp +++ b/dpnp/backend/extensions/vm/nextafter.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "nextafter.hpp" diff --git a/dpnp/backend/extensions/vm/pow.cpp b/dpnp/backend/extensions/vm/pow.cpp index 5969a4862730..f0db87d1ef48 100644 --- a/dpnp/backend/extensions/vm/pow.cpp +++ b/dpnp/backend/extensions/vm/pow.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "pow.hpp" diff --git a/dpnp/backend/extensions/vm/rint.cpp b/dpnp/backend/extensions/vm/rint.cpp index 41cd20a944a0..86931f259a04 100644 --- a/dpnp/backend/extensions/vm/rint.cpp +++ b/dpnp/backend/extensions/vm/rint.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "rint.hpp" diff --git a/dpnp/backend/extensions/vm/sin.cpp b/dpnp/backend/extensions/vm/sin.cpp index 9263c3c4ffcf..7bb6ec321d2a 100644 --- a/dpnp/backend/extensions/vm/sin.cpp +++ b/dpnp/backend/extensions/vm/sin.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sin.hpp" diff --git a/dpnp/backend/extensions/vm/sinh.cpp b/dpnp/backend/extensions/vm/sinh.cpp index a1bae13a5281..5c351afd3b82 100644 --- a/dpnp/backend/extensions/vm/sinh.cpp +++ b/dpnp/backend/extensions/vm/sinh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sinh.hpp" diff --git a/dpnp/backend/extensions/vm/sqr.cpp b/dpnp/backend/extensions/vm/sqr.cpp index 88c2e833b483..9d5cb8af5f2c 100644 --- a/dpnp/backend/extensions/vm/sqr.cpp +++ b/dpnp/backend/extensions/vm/sqr.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sqr.hpp" diff --git a/dpnp/backend/extensions/vm/sqrt.cpp b/dpnp/backend/extensions/vm/sqrt.cpp index 98cf2eea9253..5ab3489c1288 100644 --- a/dpnp/backend/extensions/vm/sqrt.cpp +++ b/dpnp/backend/extensions/vm/sqrt.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sqrt.hpp" diff --git a/dpnp/backend/extensions/vm/sub.cpp b/dpnp/backend/extensions/vm/sub.cpp index 5ee01f239c06..401588d4b65f 100644 --- a/dpnp/backend/extensions/vm/sub.cpp +++ b/dpnp/backend/extensions/vm/sub.cpp @@ -36,7 +36,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "sub.hpp" diff --git a/dpnp/backend/extensions/vm/tan.cpp b/dpnp/backend/extensions/vm/tan.cpp index 46555ebd0178..590320034934 100644 --- a/dpnp/backend/extensions/vm/tan.cpp +++ b/dpnp/backend/extensions/vm/tan.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "tan.hpp" 
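Note: every vm/*.cpp hunk above applies the same mechanical substitution, replacing the dpctl-provided "dpctl4pybind11.hpp" with the vendored "dpnp4pybind11.hpp" (added as a new file later in this patch), along with a few additional includes. The practical effect is that the pybind11 type casters and the dpctl::tensor::usm_ndarray wrapper resolve against dpnp.tensor._usmarray instead of dpctl.tensor._usmarray. A minimal sketch of a converted extension source, assuming a hypothetical module _example_impl and function nbytes (neither exists in this patch):

    #include <cstddef>

    #include <pybind11/pybind11.h>
    #include <sycl/sycl.hpp>

    #include "dpnp4pybind11.hpp"

    namespace py = pybind11;

    // usm_ndarray below is the wrapper class defined in dpnp4pybind11.hpp;
    // its type caster accepts dpnp.tensor._usmarray.usm_ndarray objects.
    static std::size_t nbytes(const dpctl::tensor::usm_ndarray &src)
    {
        return static_cast<std::size_t>(src.get_size()) * src.get_elemsize();
    }

    PYBIND11_MODULE(_example_impl, m)
    {
        m.def("nbytes", &nbytes, "Total number of bytes held by a usm_ndarray");
    }

The tanh.cpp and trunc.cpp hunks below apply the same substitution.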
diff --git a/dpnp/backend/extensions/vm/tanh.cpp b/dpnp/backend/extensions/vm/tanh.cpp index 04d2febfac1d..8febd94f2ec8 100644 --- a/dpnp/backend/extensions/vm/tanh.cpp +++ b/dpnp/backend/extensions/vm/tanh.cpp @@ -35,7 +35,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "tanh.hpp" diff --git a/dpnp/backend/extensions/vm/trunc.cpp b/dpnp/backend/extensions/vm/trunc.cpp index c23a9a8180fb..4ec788ccf949 100644 --- a/dpnp/backend/extensions/vm/trunc.cpp +++ b/dpnp/backend/extensions/vm/trunc.cpp @@ -34,7 +34,10 @@ #include #include -#include "dpctl4pybind11.hpp" +#include +#include + +#include "dpnp4pybind11.hpp" #include "common.hpp" #include "trunc.hpp" diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 0cebfe79b2de..9dac2df9d0df 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -36,6 +36,9 @@ set(_module_src pybind11_add_module(${python_module_name} MODULE ${_module_src}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) +# Ensure Cython modules build first so _usmarray.h exists +add_dependencies(${python_module_name} _usmarray) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( @@ -62,14 +65,21 @@ set_target_properties( target_include_directories( ${python_module_name} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../ + ${CMAKE_CURRENT_SOURCE_DIR}/../common + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${CMAKE_SOURCE_DIR}/dpnp/tensor/libtensor/include ) # treat below headers as system to suppress the warnings there during the build target_include_directories( ${python_module_name} SYSTEM - PRIVATE ${SYCL_INCLUDE_DIR} ${Dpctl_INCLUDE_DIRS} ${Dpctl_TENSOR_INCLUDE_DIR} + PRIVATE + ${SYCL_INCLUDE_DIR} + ${Dpctl_INCLUDE_DIRS} + ${CMAKE_BINARY_DIR} # For generated Cython headers ) if(WIN32) diff --git a/dpnp/backend/extensions/window/common.hpp b/dpnp/backend/extensions/window/common.hpp index 9e7b1192e3a2..fcec281b3948 100644 --- a/dpnp/backend/extensions/window/common.hpp +++ b/dpnp/backend/extensions/window/common.hpp @@ -37,7 +37,7 @@ #include -#include "dpctl4pybind11.hpp" +#include "dpnp4pybind11.hpp" #include #include diff --git a/dpnp/backend/extensions/window/kaiser.hpp b/dpnp/backend/extensions/window/kaiser.hpp index 4ba506620db2..9a088e700a2f 100644 --- a/dpnp/backend/extensions/window/kaiser.hpp +++ b/dpnp/backend/extensions/window/kaiser.hpp @@ -30,7 +30,7 @@ #include -#include +#include #include namespace dpnp::extensions::window diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp new file mode 100644 index 000000000000..8bc931a3ca1a --- /dev/null +++ b/dpnp/backend/include/dpnp4pybind11.hpp @@ -0,0 +1,1328 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +// Include dpctl C-API headers +#include "dpctl_capi.h" + +// Include generated Cython headers for usm_ndarray +// (struct definition and constants only) +#include "dpnp/tensor/_usmarray.h" +#include "dpnp/tensor/_usmarray_api.h" + +#include +#include +#include +#include // for std::size_t for C++ linkage +#include +#include +#include // for size_t for C linkage +#include +#include +#include +#include + +#include + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace detail +{ +// Lookup a type according to its size, and return a value corresponding to the +// NumPy typenum. +template +constexpr int platform_typeid_lookup() +{ + return -1; +} + +template +constexpr int platform_typeid_lookup(int I, Ints... Is) +{ + return sizeof(Concrete) == sizeof(T) + ? 
I + : platform_typeid_lookup(Is...); +} + +class dpctl_capi +{ +public: + // dpctl type objects + PyTypeObject *Py_SyclDeviceType_; + PyTypeObject *PySyclDeviceType_; + PyTypeObject *Py_SyclContextType_; + PyTypeObject *PySyclContextType_; + PyTypeObject *Py_SyclEventType_; + PyTypeObject *PySyclEventType_; + PyTypeObject *Py_SyclQueueType_; + PyTypeObject *PySyclQueueType_; + PyTypeObject *Py_MemoryType_; + PyTypeObject *PyMemoryUSMDeviceType_; + PyTypeObject *PyMemoryUSMSharedType_; + PyTypeObject *PyMemoryUSMHostType_; + PyTypeObject *PyUSMArrayType_; + PyTypeObject *PySyclProgramType_; + PyTypeObject *PySyclKernelType_; + + DPCTLSyclDeviceRef (*SyclDevice_GetDeviceRef_)(PySyclDeviceObject *); + PySyclDeviceObject *(*SyclDevice_Make_)(DPCTLSyclDeviceRef); + + DPCTLSyclContextRef (*SyclContext_GetContextRef_)(PySyclContextObject *); + PySyclContextObject *(*SyclContext_Make_)(DPCTLSyclContextRef); + + DPCTLSyclEventRef (*SyclEvent_GetEventRef_)(PySyclEventObject *); + PySyclEventObject *(*SyclEvent_Make_)(DPCTLSyclEventRef); + + DPCTLSyclQueueRef (*SyclQueue_GetQueueRef_)(PySyclQueueObject *); + PySyclQueueObject *(*SyclQueue_Make_)(DPCTLSyclQueueRef); + + // memory + DPCTLSyclUSMRef (*Memory_GetUsmPointer_)(Py_MemoryObject *); + void *(*Memory_GetOpaquePointer_)(Py_MemoryObject *); + DPCTLSyclContextRef (*Memory_GetContextRef_)(Py_MemoryObject *); + DPCTLSyclQueueRef (*Memory_GetQueueRef_)(Py_MemoryObject *); + size_t (*Memory_GetNumBytes_)(Py_MemoryObject *); + PyObject *(*Memory_Make_)(DPCTLSyclUSMRef, + size_t, + DPCTLSyclQueueRef, + PyObject *); + + // program + DPCTLSyclKernelRef (*SyclKernel_GetKernelRef_)(PySyclKernelObject *); + PySyclKernelObject *(*SyclKernel_Make_)(DPCTLSyclKernelRef, const char *); + + DPCTLSyclKernelBundleRef (*SyclProgram_GetKernelBundleRef_)( + PySyclProgramObject *); + PySyclProgramObject *(*SyclProgram_Make_)(DPCTLSyclKernelBundleRef); + + int USM_ARRAY_C_CONTIGUOUS_; + int USM_ARRAY_F_CONTIGUOUS_; + int USM_ARRAY_WRITABLE_; + int UAR_BOOL_, UAR_BYTE_, UAR_UBYTE_, UAR_SHORT_, UAR_USHORT_, UAR_INT_, + UAR_UINT_, UAR_LONG_, UAR_ULONG_, UAR_LONGLONG_, UAR_ULONGLONG_, + UAR_FLOAT_, UAR_DOUBLE_, UAR_CFLOAT_, UAR_CDOUBLE_, UAR_TYPE_SENTINEL_, + UAR_HALF_; + int UAR_INT8_, UAR_UINT8_, UAR_INT16_, UAR_UINT16_, UAR_INT32_, UAR_UINT32_, + UAR_INT64_, UAR_UINT64_; + + bool PySyclDevice_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclDeviceType_) != 0; + } + bool PySyclContext_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclContextType_) != 0; + } + bool PySyclEvent_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclEventType_) != 0; + } + bool PySyclQueue_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclQueueType_) != 0; + } + bool PySyclKernel_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclKernelType_) != 0; + } + bool PySyclProgram_Check_(PyObject *obj) const + { + return PyObject_TypeCheck(obj, PySyclProgramType_) != 0; + } + + ~dpctl_capi() + { + as_usm_memory_.reset(); + default_usm_ndarray_.reset(); + default_usm_memory_.reset(); + default_sycl_queue_.reset(); + }; + + static auto &get() + { + static dpctl_capi api{}; + return api; + } + + py::object default_sycl_queue_pyobj() { return *default_sycl_queue_; } + py::object default_usm_memory_pyobj() { return *default_usm_memory_; } + py::object default_usm_ndarray_pyobj() { return *default_usm_ndarray_; } + py::object as_usm_memory_pyobj() { return *as_usm_memory_; } + +private: + struct Deleter + { 
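+ // The deleter below intentionally leaks the py::object instead of
+ // deleting it when the Python interpreter is not initialized or is
+ // already finalizing: touching reference counts at that stage of
+ // interpreter shutdown is unsafe.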
+ void operator()(py::object *p) const + { + const bool initialized = Py_IsInitialized(); +#if PY_VERSION_HEX < 0x30d0000 + const bool finalizing = _Py_IsFinalizing(); +#else + const bool finalizing = Py_IsFinalizing(); +#endif + const bool guard = initialized && !finalizing; + + if (guard) { + delete p; + } + } + }; + + std::shared_ptr default_sycl_queue_; + std::shared_ptr default_usm_memory_; + std::shared_ptr default_usm_ndarray_; + std::shared_ptr as_usm_memory_; + + dpctl_capi() + : Py_SyclDeviceType_(nullptr), PySyclDeviceType_(nullptr), + Py_SyclContextType_(nullptr), PySyclContextType_(nullptr), + Py_SyclEventType_(nullptr), PySyclEventType_(nullptr), + Py_SyclQueueType_(nullptr), PySyclQueueType_(nullptr), + Py_MemoryType_(nullptr), PyMemoryUSMDeviceType_(nullptr), + PyMemoryUSMSharedType_(nullptr), PyMemoryUSMHostType_(nullptr), + PyUSMArrayType_(nullptr), PySyclProgramType_(nullptr), + PySyclKernelType_(nullptr), SyclDevice_GetDeviceRef_(nullptr), + SyclDevice_Make_(nullptr), SyclContext_GetContextRef_(nullptr), + SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr), + SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr), + SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr), + Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr), + Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr), + Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr), + SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr), + SyclProgram_Make_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0), + USM_ARRAY_F_CONTIGUOUS_(0), USM_ARRAY_WRITABLE_(0), UAR_BOOL_(-1), + UAR_BYTE_(-1), UAR_UBYTE_(-1), UAR_SHORT_(-1), UAR_USHORT_(-1), + UAR_INT_(-1), UAR_UINT_(-1), UAR_LONG_(-1), UAR_ULONG_(-1), + UAR_LONGLONG_(-1), UAR_ULONGLONG_(-1), UAR_FLOAT_(-1), + UAR_DOUBLE_(-1), UAR_CFLOAT_(-1), UAR_CDOUBLE_(-1), + UAR_TYPE_SENTINEL_(-1), UAR_HALF_(-1), UAR_INT8_(-1), UAR_UINT8_(-1), + UAR_INT16_(-1), UAR_UINT16_(-1), UAR_INT32_(-1), UAR_UINT32_(-1), + UAR_INT64_(-1), UAR_UINT64_(-1), default_sycl_queue_{}, + default_usm_memory_{}, default_usm_ndarray_{}, as_usm_memory_{} + + { + // Import dpctl C-API + // (device, context, event, queue, memory, program) + import_dpctl(); + // Import dpnp tensor module for PyUSMArrayType + import_dpnp__tensor___usmarray(); + + // Python type objects for classes implemented by dpctl + this->Py_SyclDeviceType_ = &Py_SyclDeviceType; + this->PySyclDeviceType_ = &PySyclDeviceType; + this->Py_SyclContextType_ = &Py_SyclContextType; + this->PySyclContextType_ = &PySyclContextType; + this->Py_SyclEventType_ = &Py_SyclEventType; + this->PySyclEventType_ = &PySyclEventType; + this->Py_SyclQueueType_ = &Py_SyclQueueType; + this->PySyclQueueType_ = &PySyclQueueType; + this->Py_MemoryType_ = &Py_MemoryType; + this->PyMemoryUSMDeviceType_ = &PyMemoryUSMDeviceType; + this->PyMemoryUSMSharedType_ = &PyMemoryUSMSharedType; + this->PyMemoryUSMHostType_ = &PyMemoryUSMHostType; + this->PyUSMArrayType_ = &PyUSMArrayType; + this->PySyclProgramType_ = &PySyclProgramType; + this->PySyclKernelType_ = &PySyclKernelType; + + // SyclDevice API + this->SyclDevice_GetDeviceRef_ = SyclDevice_GetDeviceRef; + this->SyclDevice_Make_ = SyclDevice_Make; + + // SyclContext API + this->SyclContext_GetContextRef_ = SyclContext_GetContextRef; + this->SyclContext_Make_ = SyclContext_Make; + + // SyclEvent API + this->SyclEvent_GetEventRef_ = SyclEvent_GetEventRef; + this->SyclEvent_Make_ = SyclEvent_Make; + + // SyclQueue API + this->SyclQueue_GetQueueRef_ = SyclQueue_GetQueueRef; + 
this->SyclQueue_Make_ = SyclQueue_Make; + + // dpctl.memory API + this->Memory_GetUsmPointer_ = Memory_GetUsmPointer; + this->Memory_GetOpaquePointer_ = Memory_GetOpaquePointer; + this->Memory_GetContextRef_ = Memory_GetContextRef; + this->Memory_GetQueueRef_ = Memory_GetQueueRef; + this->Memory_GetNumBytes_ = Memory_GetNumBytes; + this->Memory_Make_ = Memory_Make; + + // dpctl.program API + this->SyclKernel_GetKernelRef_ = SyclKernel_GetKernelRef; + this->SyclKernel_Make_ = SyclKernel_Make; + this->SyclProgram_GetKernelBundleRef_ = SyclProgram_GetKernelBundleRef; + this->SyclProgram_Make_ = SyclProgram_Make; + + // constants + this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS; + this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS; + this->USM_ARRAY_WRITABLE_ = USM_ARRAY_WRITABLE; + this->UAR_BOOL_ = UAR_BOOL; + this->UAR_BYTE_ = UAR_BYTE; + this->UAR_UBYTE_ = UAR_UBYTE; + this->UAR_SHORT_ = UAR_SHORT; + this->UAR_USHORT_ = UAR_USHORT; + this->UAR_INT_ = UAR_INT; + this->UAR_UINT_ = UAR_UINT; + this->UAR_LONG_ = UAR_LONG; + this->UAR_ULONG_ = UAR_ULONG; + this->UAR_LONGLONG_ = UAR_LONGLONG; + this->UAR_ULONGLONG_ = UAR_ULONGLONG; + this->UAR_FLOAT_ = UAR_FLOAT; + this->UAR_DOUBLE_ = UAR_DOUBLE; + this->UAR_CFLOAT_ = UAR_CFLOAT; + this->UAR_CDOUBLE_ = UAR_CDOUBLE; + this->UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL; + this->UAR_HALF_ = UAR_HALF; + + // deduced disjoint types + this->UAR_INT8_ = UAR_BYTE; + this->UAR_UINT8_ = UAR_UBYTE; + this->UAR_INT16_ = UAR_SHORT; + this->UAR_UINT16_ = UAR_USHORT; + this->UAR_INT32_ = + platform_typeid_lookup( + UAR_LONG, UAR_INT, UAR_SHORT); + this->UAR_UINT32_ = + platform_typeid_lookup(UAR_ULONG, UAR_UINT, + UAR_USHORT); + this->UAR_INT64_ = + platform_typeid_lookup( + UAR_LONG, UAR_LONGLONG, UAR_INT); + this->UAR_UINT64_ = + platform_typeid_lookup( + UAR_ULONG, UAR_ULONGLONG, UAR_UINT); + + // create shared pointers to python objects used in type-casters + // for dpctl::memory::usm_memory and dpctl::tensor::usm_ndarray + sycl::queue q_{}; + PySyclQueueObject *py_q_tmp = + SyclQueue_Make(reinterpret_cast(&q_)); + const py::object &py_sycl_queue = py::reinterpret_steal( + reinterpret_cast(py_q_tmp)); + + default_sycl_queue_ = std::shared_ptr( + new py::object(py_sycl_queue), Deleter{}); + + py::module_ mod_memory = py::module_::import("dpctl.memory"); + const py::object &py_as_usm_memory = mod_memory.attr("as_usm_memory"); + as_usm_memory_ = std::shared_ptr( + new py::object{py_as_usm_memory}, Deleter{}); + + auto mem_kl = mod_memory.attr("MemoryUSMHost"); + const py::object &py_default_usm_memory = + mem_kl(1, py::arg("queue") = py_sycl_queue); + default_usm_memory_ = std::shared_ptr( + new py::object{py_default_usm_memory}, Deleter{}); + + py::module_ mod_usmarray = py::module_::import("dpnp.tensor._usmarray"); + auto tensor_kl = mod_usmarray.attr("usm_ndarray"); + + const py::object &py_default_usm_ndarray = + tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"), + py::arg("buffer") = py_default_usm_memory); + + default_usm_ndarray_ = std::shared_ptr( + new py::object{py_default_usm_ndarray}, Deleter{}); + } + + dpctl_capi(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi const &) = default; + dpctl_capi &operator=(dpctl_capi &&) = default; + +}; // struct dpctl_capi +} // namespace detail +} // namespace dpctl + +namespace pybind11::detail +{ +#define DPCTL_TYPE_CASTER(type, py_name) \ +protected: \ + std::unique_ptr value; \ + \ +public: \ + static constexpr auto name = py_name; \ + template < \ + typename T_, \ + 
::pybind11::detail::enable_if_t< \ + std::is_same>::value, \ + int> = 0> \ + static ::pybind11::handle cast(T_ *src, \ + ::pybind11::return_value_policy policy, \ + ::pybind11::handle parent) \ + { \ + if (!src) \ + return ::pybind11::none().release(); \ + if (policy == ::pybind11::return_value_policy::take_ownership) { \ + auto h = cast(std::move(*src), policy, parent); \ + delete src; \ + return h; \ + } \ + return cast(*src, policy, parent); \ + } \ + operator type *() \ + { \ + return value.get(); \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + operator type &() \ + { \ + return *value; \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + operator type &&() && \ + { \ + return std::move(*value); \ + } /* NOLINT(bugprone-macro-parentheses) */ \ + template \ + using cast_op_type = ::pybind11::detail::movable_cast_op_type + +/* This type caster associates ``sycl::queue`` C++ class with + * :class:`dpctl.SyclQueue` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclQueue_Check_(source)) { + DPCTLSyclQueueRef QRef = api.SyclQueue_GetQueueRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(QRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclQueue"); + } + } + + static handle cast(sycl::queue src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclQueue_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::queue, _("dpctl.SyclQueue")); +}; + +/* This type caster associates ``sycl::device`` C++ class with + * :class:`dpctl.SyclDevice` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclDevice_Check_(source)) { + DPCTLSyclDeviceRef DRef = api.SyclDevice_GetDeviceRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(DRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclDevice"); + } + } + + static handle cast(sycl::device src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclDevice_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::device, _("dpctl.SyclDevice")); +}; + +/* This type caster associates ``sycl::context`` C++ class with + * :class:`dpctl.SyclContext` for the purposes of generation of + * Python bindings by pybind11. 
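+ *
+ * A short usage sketch (the module and function names below are
+ * illustrative only and are not defined by this header):
+ *
+ *     PYBIND11_MODULE(_ctx_example, m)
+ *     {
+ *         m.def("device_count", [](const sycl::context &ctx) {
+ *             return ctx.get_devices().size();
+ *         });
+ *     }
+ *
+ * A dpctl.SyclContext argument passed from Python is unpacked by load();
+ * any other argument type results in a py::type_error.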
+ */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclContext_Check_(source)) { + DPCTLSyclContextRef CRef = api.SyclContext_GetContextRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(CRef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclContext"); + } + } + + static handle cast(sycl::context src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclContext_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::context, _("dpctl.SyclContext")); +}; + +/* This type caster associates ``sycl::event`` C++ class with + * :class:`dpctl.SyclEvent` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclEvent_Check_(source)) { + DPCTLSyclEventRef ERef = api.SyclEvent_GetEventRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(ERef))); + return true; + } + else { + throw py::type_error( + "Input is of unexpected type, expected dpctl.SyclEvent"); + } + } + + static handle cast(sycl::event src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclEvent_Make_(reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::event, _("dpctl.SyclEvent")); +}; + +/* This type caster associates ``sycl::kernel`` C++ class with + * :class:`dpctl.program.SyclKernel` for the purposes of generation of + * Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclKernel_Check_(source)) { + DPCTLSyclKernelRef KRef = api.SyclKernel_GetKernelRef_( + reinterpret_cast(source)); + value = std::make_unique( + *(reinterpret_cast(KRef))); + return true; + } + else { + throw py::type_error("Input is of unexpected type, expected " + "dpctl.program.SyclKernel"); + } + } + + static handle cast(sycl::kernel src, return_value_policy, handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = + api.SyclKernel_Make_(reinterpret_cast(&src), + "dpctl4pybind11_kernel"); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::kernel, _("dpctl.program.SyclKernel")); +}; + +/* This type caster associates + * ``sycl::kernel_bundle`` C++ class with + * :class:`dpctl.program.SyclProgram` for the purposes of generation of + * Python bindings by pybind11. 
+ */ +template <> +struct type_caster> +{ +public: + bool load(handle src, bool) + { + PyObject *source = src.ptr(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + if (api.PySyclProgram_Check_(source)) { + DPCTLSyclKernelBundleRef KBRef = + api.SyclProgram_GetKernelBundleRef_( + reinterpret_cast(source)); + value = std::make_unique< + sycl::kernel_bundle>( + *(reinterpret_cast< + sycl::kernel_bundle *>( + KBRef))); + return true; + } + else { + throw py::type_error("Input is of unexpected type, expected " + "dpctl.program.SyclProgram"); + } + } + + static handle cast(sycl::kernel_bundle src, + return_value_policy, + handle) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + auto tmp = api.SyclProgram_Make_( + reinterpret_cast(&src)); + return handle(reinterpret_cast(tmp)); + } + + DPCTL_TYPE_CASTER(sycl::kernel_bundle, + _("dpctl.program.SyclProgram")); +}; + +/* This type caster associates + * ``sycl::half`` C++ class with Python :class:`float` for the purposes + * of generation of Python bindings by pybind11. + */ +template <> +struct type_caster +{ +public: + bool load(handle src, bool convert) + { + double py_value; + + if (!src) { + return false; + } + + PyObject *source = src.ptr(); + + if (convert || PyFloat_Check(source)) { + py_value = PyFloat_AsDouble(source); + } + else { + return false; + } + + bool py_err = (py_value == double(-1)) && PyErr_Occurred(); + + if (py_err) { + PyErr_Clear(); + if (convert && (PyNumber_Check(source) != 0)) { + auto tmp = reinterpret_steal(PyNumber_Float(source)); + return load(tmp, false); + } + return false; + } + value = static_cast(py_value); + return true; + } + + static handle cast(sycl::half src, return_value_policy, handle) + { + return PyFloat_FromDouble(static_cast(src)); + } + + PYBIND11_TYPE_CASTER(sycl::half, _("float")); +}; +} // namespace pybind11::detail + +namespace dpctl +{ +namespace memory +{ +// since PYBIND11_OBJECT_CVT uses error_already_set without a namespace +// qualifier, this using-declaration avoids a compilation error +using pybind11::error_already_set; + +class usm_memory : public py::object +{ +public: + PYBIND11_OBJECT_CVT( + usm_memory, + py::object, + [](PyObject *o) -> bool { + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().Py_MemoryType_) != + 0; + }, + [](PyObject *o) -> PyObject * { return as_usm_memory(o); }) + + usm_memory() + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_memory_pyobj(), + borrowed_t{}) + { + if (!m_ptr) + throw py::error_already_set(); + } + + /*! @brief Create usm_memory object from shared pointer that manages + * lifetime of the USM allocation.
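+ *
+ * The constructor takes the raw USM pointer, the allocation size in
+ * bytes, the sycl::queue the allocation is bound to, and the shared_ptr
+ * whose deleter is expected to eventually free the allocation.
+ *
+ * A minimal sketch, assuming a sycl::queue q and a byte count n
+ * (illustrative only):
+ *
+ *     auto sp = std::shared_ptr<void>(
+ *         sycl::malloc_device<char>(n, q),
+ *         [q](void *ptr) { sycl::free(ptr, q); });
+ *     dpctl::memory::usm_memory mem(sp.get(), n, q, sp);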
+ */ + usm_memory(void *usm_ptr, + std::size_t nbytes, + const sycl::queue &q, + std::shared_ptr shptr) + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef usm_ref = reinterpret_cast(usm_ptr); + auto q_uptr = std::make_unique(q); + DPCTLSyclQueueRef QRef = + reinterpret_cast(q_uptr.get()); + + auto vacuous_destructor = []() {}; + py::capsule mock_owner(vacuous_destructor); + + // create memory object owned by mock_owner, it is a new reference + PyObject *_memory = + api.Memory_Make_(usm_ref, nbytes, QRef, mock_owner.ptr()); + auto ref_count_decrementer = [](PyObject *o) noexcept { Py_DECREF(o); }; + + using py_uptrT = + std::unique_ptr; + + if (!_memory) { + throw py::error_already_set(); + } + + auto memory_uptr = py_uptrT(_memory, ref_count_decrementer); + std::shared_ptr *opaque_ptr = new std::shared_ptr(shptr); + + Py_MemoryObject *memobj = reinterpret_cast(_memory); + // replace the mock_owner capsule with Py_None as the owner + memobj->refobj = Py_None; + // set the opaque ptr field so that usm_memory knows the USM + // allocation is managed by a smart pointer + memobj->_opaque_ptr = reinterpret_cast(opaque_ptr); + + // _memory now owns the created copies of sycl::queue and + // std::shared_ptr; the deleter of the shared_ptr is responsible + // for freeing the USM allocation + m_ptr = _memory; + q_uptr.release(); + memory_uptr.release(); + } + + sycl::queue get_queue() const + { + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); + sycl::queue *obj_q = reinterpret_cast(QRef); + return *obj_q; + } + + char *get_pointer() const + { + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclUSMRef MRef = api.Memory_GetUsmPointer_(mem_obj); + return reinterpret_cast(MRef); + } + + std::size_t get_nbytes() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + return api.Memory_GetNumBytes_(mem_obj); + } + + bool is_managed_by_smart_ptr() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + return bool(opaque_ptr); + } + + const std::shared_ptr &get_smart_ptr_owner() const + { + auto const &api = ::dpctl::detail::dpctl_capi::get(); + Py_MemoryObject *mem_obj = reinterpret_cast(m_ptr); + void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj); + + if (opaque_ptr) { + auto shptr_ptr = + reinterpret_cast *>(opaque_ptr); + return *shptr_ptr; + } + else { + throw std::runtime_error( + "Memory object does not have smart pointer " + "managing lifetime of USM allocation"); + } + } + +protected: + static PyObject *as_usm_memory(PyObject *o) + { + if (o == nullptr) { + PyErr_SetString(PyExc_ValueError, + "cannot create a usm_memory from a nullptr"); + return nullptr; + } + + auto converter = + ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); + + py::object res; + try { + res = converter(py::handle(o)); + } catch (const py::error_already_set &e) { + return nullptr; + } + return res.ptr(); + } +}; +} // end namespace memory + +namespace tensor +{ +inline std::vector + c_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector c_strides(nd, element_size); + for (int ic = nd - 1; ic > 0;) { + py::ssize_t next_v = c_strides[ic] * shape[ic]; + c_strides[--ic] =
next_v; + } + return c_strides; + } + else { + return std::vector(); + } +} + +inline std::vector + f_contiguous_strides(int nd, + const py::ssize_t *shape, + py::ssize_t element_size = 1) +{ + if (nd > 0) { + std::vector f_strides(nd, element_size); + for (int i = 0; i < nd - 1;) { + py::ssize_t next_v = f_strides[i] * shape[i]; + f_strides[++i] = next_v; + } + return f_strides; + } + else { + return std::vector(); + } +} + +inline std::vector + c_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return c_contiguous_strides(shape.size(), shape.data(), element_size); +} + +inline std::vector + f_contiguous_strides(const std::vector &shape, + py::ssize_t element_size = 1) +{ + return f_contiguous_strides(shape.size(), shape.data(), element_size); +} + +class usm_ndarray : public py::object +{ +public: + PYBIND11_OBJECT(usm_ndarray, py::object, [](PyObject *o) -> bool { + return PyObject_TypeCheck( + o, ::dpctl::detail::dpctl_capi::get().PyUSMArrayType_) != 0; + }) + + usm_ndarray() + : py::object( + ::dpctl::detail::dpctl_capi::get().default_usm_ndarray_pyobj(), + borrowed_t{}) + { + if (!m_ptr) + throw py::error_already_set(); + } + + char *get_data() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->data_; + } + + template + T *get_data() const + { + return reinterpret_cast(get_data()); + } + + int get_ndim() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->nd_; + } + + const py::ssize_t *get_shape_raw() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->shape_; + } + + std::vector get_shape_vector() const + { + auto raw_sh = get_shape_raw(); + auto nd = get_ndim(); + + std::vector shape_vector(raw_sh, raw_sh + nd); + return shape_vector; + } + + py::ssize_t get_shape(int i) const + { + auto shape_ptr = get_shape_raw(); + return shape_ptr[i]; + } + + const py::ssize_t *get_strides_raw() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->strides_; + } + + std::vector get_strides_vector() const + { + auto raw_st = get_strides_raw(); + auto nd = get_ndim(); + + if (raw_st == nullptr) { + auto is_c_contig = is_c_contiguous(); + auto is_f_contig = is_f_contiguous(); + auto raw_sh = get_shape_raw(); + if (is_c_contig) { + const auto &contig_strides = c_contiguous_strides(nd, raw_sh); + return contig_strides; + } + else if (is_f_contig) { + const auto &contig_strides = f_contiguous_strides(nd, raw_sh); + return contig_strides; + } + else { + throw std::runtime_error("Invalid array encountered when " + "building strides"); + } + } + else { + std::vector st_vec(raw_st, raw_st + nd); + return st_vec; + } + } + + py::ssize_t get_size() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + int ndim = raw_ar->nd_; + const py::ssize_t *shape = raw_ar->shape_; + + py::ssize_t nelems = 1; + for (int i = 0; i < ndim; ++i) { + nelems *= shape[i]; + } + + assert(nelems >= 0); + return nelems; + } + + std::pair get_minmax_offsets() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + + int nd = raw_ar->nd_; + const py::ssize_t *shape = raw_ar->shape_; + const py::ssize_t *strides = raw_ar->strides_; + + py::ssize_t offset_min = 0; + py::ssize_t offset_max = 0; + if (strides == nullptr) { + py::ssize_t stride(1); + for (int i = 0; i < nd; ++i) { + offset_max += stride * (shape[i] - 1); + stride *= shape[i]; + } + } + else { + for (int i = 0; i < nd; ++i) { + py::ssize_t delta = strides[i] * (shape[i] - 1); + if (strides[i] > 0) { + offset_max += delta; + } + else { + offset_min 
+= delta; + } + } + } + return std::make_pair(offset_min, offset_max); + } + + sycl::queue get_queue() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + Py_MemoryObject *mem_obj = + reinterpret_cast(raw_ar->base_); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); + return *(reinterpret_cast(QRef)); + } + + sycl::device get_device() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + Py_MemoryObject *mem_obj = + reinterpret_cast(raw_ar->base_); + + auto const &api = ::dpctl::detail::dpctl_capi::get(); + DPCTLSyclQueueRef QRef = api.Memory_GetQueueRef_(mem_obj); + return reinterpret_cast(QRef)->get_device(); + } + + int get_typenum() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->typenum_; + } + + int get_flags() const + { + PyUSMArrayObject *raw_ar = usm_array_ptr(); + return raw_ar->flags_; + } + + int get_elemsize() const + { + int typenum = get_typenum(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + // Lookup table for element sizes based on typenum + if (typenum == api.UAR_BOOL_) + return 1; + if (typenum == api.UAR_BYTE_) + return 1; + if (typenum == api.UAR_UBYTE_) + return 1; + if (typenum == api.UAR_SHORT_) + return 2; + if (typenum == api.UAR_USHORT_) + return 2; + if (typenum == api.UAR_INT_) + return 4; + if (typenum == api.UAR_UINT_) + return 4; + if (typenum == api.UAR_LONG_) + return sizeof(long); + if (typenum == api.UAR_ULONG_) + return sizeof(unsigned long); + if (typenum == api.UAR_LONGLONG_) + return 8; + if (typenum == api.UAR_ULONGLONG_) + return 8; + if (typenum == api.UAR_FLOAT_) + return 4; + if (typenum == api.UAR_DOUBLE_) + return 8; + if (typenum == api.UAR_CFLOAT_) + return 8; + if (typenum == api.UAR_CDOUBLE_) + return 16; + if (typenum == api.UAR_HALF_) + return 2; + + return 0; // Unknown type + } + + bool is_c_contiguous() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_C_CONTIGUOUS_); + } + + bool is_f_contiguous() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_F_CONTIGUOUS_); + } + + bool is_writable() const + { + int flags = get_flags(); + auto const &api = ::dpctl::detail::dpctl_capi::get(); + return static_cast(flags & api.USM_ARRAY_WRITABLE_); + } + + /*! 
@brief Get usm_data property of array */
+    py::object get_usm_data() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+        // base_ is the Memory object - return new reference
+        PyObject *usm_data = raw_ar->base_;
+        Py_XINCREF(usm_data);
+
+        // pass reference ownership to py::object
+        return py::reinterpret_steal<py::object>(usm_data);
+    }
+
+    bool is_managed_by_smart_ptr() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+        PyObject *usm_data = raw_ar->base_;
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
+            return false;
+        }
+
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(usm_data);
+        const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+
+        return bool(opaque_ptr);
+    }
+
+    const std::shared_ptr<void> &get_smart_ptr_owner() const
+    {
+        PyUSMArrayObject *raw_ar = usm_array_ptr();
+        PyObject *usm_data = raw_ar->base_;
+
+        auto const &api = ::dpctl::detail::dpctl_capi::get();
+
+        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
+            throw std::runtime_error(
+                "usm_ndarray object does not have Memory object "
+                "managing lifetime of USM allocation");
+        }
+
+        Py_MemoryObject *mem_obj =
+            reinterpret_cast<Py_MemoryObject *>(usm_data);
+        void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
+
+        if (opaque_ptr) {
+            auto shptr_ptr =
+                reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr);
+            return *shptr_ptr;
+        }
+        else {
+            throw std::runtime_error(
+                "Memory object underlying usm_ndarray does not have "
+                "smart pointer managing lifetime of USM allocation");
+        }
+    }
+
+private:
+    PyUSMArrayObject *usm_array_ptr() const
+    {
+        return reinterpret_cast<PyUSMArrayObject *>(m_ptr);
+    }
+};
+} // end namespace tensor
+
+namespace utils
+{
+namespace detail
+{
+struct ManagedMemory
+{
+
+    static bool is_usm_managed_by_shared_ptr(const py::object &h)
+    {
+        if (py::isinstance<dpctl::memory::usm_memory>(h)) {
+            const auto &usm_memory_inst =
+                py::cast<dpctl::memory::usm_memory>(h);
+            return usm_memory_inst.is_managed_by_smart_ptr();
+        }
+        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
+            const auto &usm_array_inst =
+                py::cast<dpctl::tensor::usm_ndarray>(h);
+            return usm_array_inst.is_managed_by_smart_ptr();
+        }
+
+        return false;
+    }
+
+    static const std::shared_ptr<void> &extract_shared_ptr(const py::object &h)
+    {
+        if (py::isinstance<dpctl::memory::usm_memory>(h)) {
+            const auto &usm_memory_inst =
+                py::cast<dpctl::memory::usm_memory>(h);
+            return usm_memory_inst.get_smart_ptr_owner();
+        }
+        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
+            const auto &usm_array_inst =
+                py::cast<dpctl::tensor::usm_ndarray>(h);
+            return usm_array_inst.get_smart_ptr_owner();
+        }
+
+        throw std::runtime_error(
+            "Attempted extraction of shared_ptr on an unrecognized type");
+    }
+};
+} // end of namespace detail
+
+template <std::size_t num>
+sycl::event keep_args_alive(sycl::queue &q,
+                            const py::object (&py_objs)[num],
+                            const std::vector<sycl::event> &depends = {})
+{
+    std::size_t n_objects_held = 0;
+    std::array<std::shared_ptr<py::handle>, num> shp_arr{};
+
+    std::size_t n_usm_owners_held = 0;
+    std::array<std::shared_ptr<void>, num> shp_usm{};
+
+    for (std::size_t i = 0; i < num; ++i) {
+        const auto &py_obj_i = py_objs[i];
+        if (detail::ManagedMemory::is_usm_managed_by_shared_ptr(py_obj_i)) {
+            const auto &shp =
+                detail::ManagedMemory::extract_shared_ptr(py_obj_i);
+            shp_usm[n_usm_owners_held] = shp;
+            ++n_usm_owners_held;
+        }
+        else {
+            shp_arr[n_objects_held] = std::make_shared<py::handle>(py_obj_i);
+            shp_arr[n_objects_held]->inc_ref();
+            ++n_objects_held;
+        }
+    }
+
+    // the first host_task submitted below waits on `depends`; if a second
+    // one is submitted as well, it is chained after the first via
+    // host_task_ev
+    bool use_depends = true;
+    sycl::event host_task_ev;
+
+    if (n_usm_owners_held > 0) {
+        host_task_ev = q.submit([&](sycl::handler &cgh) {
+            if (use_depends) {
+                cgh.depends_on(depends);
+                use_depends = false;
+            }
+            else {
+                cgh.depends_on(host_task_ev);
+            }
+            cgh.host_task([shp_usm = std::move(shp_usm)]() {
+                // no body, but shared pointers are captured in
+                // the lambda, ensuring that USM allocation is
+                // kept alive
+            });
+        });
+    }
+
+    if (n_objects_held > 0) {
+        host_task_ev = q.submit([&](sycl::handler &cgh) {
+            if (use_depends) {
+                cgh.depends_on(depends);
+                use_depends = false;
+            }
+            else {
+                cgh.depends_on(host_task_ev);
+            }
+            cgh.host_task([n_objects_held, shp_arr = std::move(shp_arr)]() {
+                py::gil_scoped_acquire acquire;
+
+                for (std::size_t i = 0; i < n_objects_held; ++i) {
+                    shp_arr[i]->dec_ref();
+                }
+            });
+        });
+    }
+
+    return host_task_ev;
+}
+
+/*! @brief Check if all allocation queues are the same as the
+    execution queue */
+template <std::size_t num>
+bool queues_are_compatible(const sycl::queue &exec_q,
+                           const sycl::queue (&alloc_qs)[num])
+{
+    for (std::size_t i = 0; i < num; ++i) {
+
+        if (exec_q != alloc_qs[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/*! @brief Check if all allocation queues of usm_ndarrays are the same as
+    the execution queue */
+template <std::size_t num>
+bool queues_are_compatible(const sycl::queue &exec_q,
+                           const ::dpctl::tensor::usm_ndarray (&arrs)[num])
+{
+    for (std::size_t i = 0; i < num; ++i) {
+
+        if (exec_q != arrs[i].get_queue()) {
+            return false;
+        }
+    }
+    return true;
+}
+} // end namespace utils
+} // end namespace dpctl
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index d94a031801f3..9c9110b85384 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -29,11 +29,10 @@ import math import operator -import dpctl.tensor as dpt -import dpctl.utils as dpu import numpy import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device @@ -46,7 +45,7 @@ def _as_usm_ndarray(a, usm_type, sycl_queue): - """Converts input object to `dpctl.tensor.usm_ndarray`""" + """Converts input object to `dpnp.tensor.usm_ndarray`""" if isinstance(a, dpnp_array): a = a.get_array() @@ -340,7 +339,7 @@ class dpnp_nd_grid: def __init__( self, sparse=False, device=None, usm_type="device", sycl_queue=None ): - dpu.validate_usm_type(usm_type, allow_none=True) + dpt.validate_usm_type(usm_type, allow_none=True) self.sparse = sparse self.usm_type = "device" if usm_type is None else usm_type self.sycl_queue_normalized = dpnp.get_normalized_queue_device( diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 57bf50422fa0..96db4b4fe4e0 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -29,30 +29,32 @@ import warnings from functools import wraps -import dpctl.tensor as dpt -import dpctl.tensor._copy_utils as dtc -import dpctl.tensor._tensor_impl as dti -import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy -from dpctl.tensor._elementwise_common import ( - BinaryElementwiseFunc, - UnaryElementwiseFunc, -) -from dpctl.tensor._scalar_utils import ( - _get_dtype, - _get_shape, - _validate_dtype, -) import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi + +# pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._copy_utils as dtc +import dpnp.tensor._tensor_impl as dti +import dpnp.tensor._type_utils as dtu from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations from dpnp.dpnp_utils.dpnp_utils_common import ( find_buf_dtype_3out, find_buf_dtype_4out, ) +from dpnp.tensor._elementwise_common import ( + BinaryElementwiseFunc, + UnaryElementwiseFunc, +) +from
dpnp.tensor._scalar_utils import ( + _get_dtype, + _get_shape, + _validate_dtype, +) __all__ = [ "DPNPI0", @@ -117,7 +119,7 @@ class DPNPUnaryFunc(UnaryElementwiseFunc): sycl_dev - The :class:`dpctl.SyclDevice` where the function evaluation is carried out. The function is invoked when the argument of the unary function - requires casting, e.g. the argument of `dpctl.tensor.log` is an + requires casting, e.g. the argument of `dpnp.tensor.log` is an array with integral data type. """ @@ -135,7 +137,7 @@ def __init__( def _call_func(src, dst, sycl_queue, depends=None): """ A callback to register in UnaryElementwiseFunc class of - dpctl.tensor + dpnp.tensor """ if depends is None: @@ -449,7 +451,7 @@ def __call__( f"Expected output shape is {x.shape}, got {res.shape}" ) - if dpu.get_execution_queue((exec_q, res.sycl_queue)) is None: + if dpt.get_execution_queue((exec_q, res.sycl_queue)) is None: raise dpnp.exceptions.ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -586,7 +588,7 @@ class DPNPBinaryFunc(BinaryElementwiseFunc): evaluation is carried out. The function is only called when both arguments of the binary function require casting, e.g. both arguments of - `dpctl.tensor.logaddexp` are arrays with integral data type. + `dpnp.tensor.logaddexp` are arrays with integral data type. weak_type_resolver : {None, callable}, optional Function to influence type promotion behavior for Python scalar types of this binary function. The function takes 3 arguments: @@ -613,7 +615,7 @@ def __init__( def _call_func(src1, src2, dst, sycl_queue, depends=None): """ A callback to register in UnaryElementwiseFunc class of - dpctl.tensor + dpnp.tensor """ if depends is None: @@ -1060,7 +1062,7 @@ def __call__( f"Expected output shape is {res_shape}, got {res.shape}" ) - if dpu.get_execution_queue((exec_q, res.sycl_queue)) is None: + if dpt.get_execution_queue((exec_q, res.sycl_queue)) is None: raise dpnp.exceptions.ExecutionPlacementError( "Input and output allocation queues are not compatible" ) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index c3bfa8fa2e80..84aa9e47b27e 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -28,18 +28,18 @@ from numbers import Number -import dpctl.tensor as dpt import dpctl.utils as dpu -from dpctl.tensor._ctors import _cast_fill_val -from dpctl.tensor._tensor_impl import ( + +import dpnp +import dpnp.tensor as dpt +from dpnp.exceptions import ExecutionPlacementError +from dpnp.tensor._ctors import _cast_fill_val +from dpnp.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, _zeros_usm_ndarray, ) -import dpnp -from dpnp.exceptions import ExecutionPlacementError - def dpnp_fill(arr, val): arr = dpnp.get_usm_ndarray(arr) @@ -50,7 +50,7 @@ def dpnp_fill(arr, val): val = dpnp.get_usm_ndarray(val) if val.shape != (): raise ValueError("`val` must be a scalar or 0D-array") - if dpu.get_execution_queue((exec_q, val.sycl_queue)) is None: + if dpt.get_execution_queue((exec_q, val.sycl_queue)) is None: raise ExecutionPlacementError( "Input arrays have incompatible queues." 
) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 951f782c3007..00a1b2d00e5d 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -32,15 +32,15 @@ """ +# pylint: disable=duplicate-code # pylint: disable=invalid-name # pylint: disable=protected-access import warnings -import dpctl.tensor as dpt -import dpctl.tensor._type_utils as dtu - import dpnp +import dpnp.tensor as dpt +import dpnp.tensor._type_utils as dtu from . import memory as dpm from .exceptions import AxisError @@ -72,7 +72,7 @@ class dpnp_array: An array object represents a multidimensional tensor of numeric elements stored in a USM allocation on a SYCL device. - This is a wrapper around :class:`dpctl.tensor.usm_ndarray` that provides + This is a wrapper around :class:`dpnp.tensor.usm_ndarray` that provides methods to be compliant with original NumPy. """ @@ -609,12 +609,12 @@ def __usm_ndarray__(self): """ Property to support ``__usm_ndarray__`` protocol. - It assumes to return :class:`dpctl.tensor.usm_ndarray` instance + It assumes to return :class:`dpnp.tensor.usm_ndarray` instance corresponding to the content of the object. This property is intended to speed-up conversion from - :class:`dpnp.ndarray` to :class:`dpctl.tensor.usm_ndarray` passed into - :func:`dpctl.tensor.asarray` function. The input object that implements + :class:`dpnp.ndarray` to :class:`dpnp.tensor.usm_ndarray` passed into + :func:`dpnp.tensor.asarray` function. The input object that implements ``__usm_ndarray__`` protocol is recognized as owner of USM allocation that is managed by a smart pointer, and asynchronous deallocation will not involve GIL. @@ -631,13 +631,13 @@ def __xor__(self, other, /): def _create_from_usm_ndarray(usm_ary: dpt.usm_ndarray): """ Return :class:`dpnp.ndarray` instance from USM allocation providing - by an instance of :class:`dpctl.tensor.usm_ndarray`. + by an instance of :class:`dpnp.tensor.usm_ndarray`. """ if not isinstance(usm_ary, dpt.usm_ndarray): raise TypeError( - f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ary)}" + f"Expected dpnp.tensor.usm_ndarray, got {type(usm_ary)}" ) res = dpnp_array.__new__(dpnp_array) res._array_obj = usm_ary @@ -956,7 +956,7 @@ def astype( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of - :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. If the value is ``None``, returned array is created on the same device as that array. @@ -1067,7 +1067,7 @@ def copy( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of - :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1162,7 +1162,7 @@ def data(self): @property def device(self): """ - Return :class:`dpctl.tensor.Device` object representing residence of + Return :class:`dpnp.tensor.Device` object representing residence of the array data. 
The ``Device`` object represents Array API notion of the device, and @@ -1329,7 +1329,7 @@ def flatten(self, /, order="C"): return self.reshape(-1, order=order, copy=True) def get_array(self): - """Get :class:`dpctl.tensor.usm_ndarray` object.""" + """Get :class:`dpnp.tensor.usm_ndarray` object.""" return self._array_obj # 'getfield', @@ -2182,7 +2182,7 @@ def to_device(self, device, /, *, stream=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of - :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + :class:`dpctl.SyclQueue`, or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. stream : {SyclQueue, None}, optional Execution queue to synchronize with. If ``None``, synchronization diff --git a/dpnp/dpnp_array_api_info.py b/dpnp/dpnp_array_api_info.py index 6a3939d046b0..ef3f1e4c2b60 100644 --- a/dpnp/dpnp_array_api_info.py +++ b/dpnp/dpnp_array_api_info.py @@ -36,7 +36,7 @@ """ -import dpctl.tensor as dpt +import dpnp.tensor as dpt def __array_namespace_info__(): diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 4975db17c717..4b38c2915178 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -35,10 +35,8 @@ """ -import dpctl.tensor as dpt -import dpctl.utils as dpu - import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array __all__ = [ @@ -66,8 +64,8 @@ def arange( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -93,7 +91,7 @@ def asarray( sycl_queue=None, ): """Converts `x1` to `dpnp_array`.""" - dpu.validate_usm_type(usm_type, allow_none=True) + dpt.validate_usm_type(usm_type, allow_none=True) if order is None: order = "K" @@ -153,8 +151,8 @@ def empty( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -184,8 +182,8 @@ def eye( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -215,8 +213,8 @@ def full( usm_type=None, sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=True) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=True) sycl_queue_normalized = dpnp.get_normalized_queue_device( fill_value, sycl_queue=sycl_queue, device=device @@ -248,8 +246,8 @@ def ones( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into 
`dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) @@ -288,8 +286,8 @@ def zeros( usm_type="device", sycl_queue=None, ): - """Validate input parameters before passing them into `dpctl.tensor` module""" - dpu.validate_usm_type(usm_type, allow_none=False) + """Validate input parameters before passing them into `dpnp.tensor` module""" + dpt.validate_usm_type(usm_type, allow_none=False) sycl_queue_normalized = dpnp.get_normalized_queue_device( sycl_queue=sycl_queue, device=device ) diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index fba1a215756a..c9d16a20e83d 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -40,26 +40,27 @@ """ # pylint: disable=protected-access +# pylint: disable=no-name-in-module import os import dpctl -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._device import normalize_queue_device import dpnp -from .dpnp_array import dpnp_array - # pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from .dpnp_array import dpnp_array from .dpnp_utils import ( dpnp_descriptor, map_dtype_to_device, use_origin_backend, ) +from .tensor._device import normalize_queue_device def are_same_logical_tensors(ar1, ar2): @@ -141,7 +142,7 @@ def asnumpy(a, order="C"): def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): """ - Return :class:`dpctl.tensor.usm_ndarray` from input object `a`. + Return :class:`dpnp.tensor.usm_ndarray` from input object `a`. Parameters ---------- @@ -158,7 +159,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. If the value is ``None``, returned array is created on the same device as `a`. @@ -179,7 +180,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): out : usm_ndarray A dpctl USM ndarray from input array or scalar `a`. If `a` is instance of :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`, no array allocation will be done + or :class:`dpnp.tensor.usm_ndarray`, no array allocation will be done and `dtype`, `device`, `usm_type`, `sycl_queue` keywords will be ignored. @@ -255,7 +256,7 @@ def check_limitations( def check_supported_arrays_type(*arrays, scalar_type=False, all_scalars=False): """ Return ``True`` if each array has either type of scalar, - :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`. + :class:`dpnp.ndarray` or :class:`dpnp.tensor.usm_ndarray`. But if any array has unsupported type, ``TypeError`` will be raised. Parameters @@ -317,7 +318,7 @@ def default_float_type(device=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. 
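
The wrappers in dpnp_container.py above all follow the same pattern: validate `usm_type`, normalize the `device`/`sycl_queue` pair through `get_normalized_queue_device`, and delegate to the tensor layer. A minimal sketch of how the three placement keywords compose after this change (the "cpu" filter-selector string is an assumption about locally available hardware):

    import dpnp

    # usm_type is validated, the device string is normalized to a queue
    x = dpnp.ones((2, 3), device="cpu", usm_type="device")
    print(x.usm_type)    # -> "device"
    print(x.device)      # dpnp.tensor.Device for the normalized queue
    print(x.sycl_queue)  # dpctl.SyclQueue the allocation lives on
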
The value ``None`` is interpreted as to use a default device. @@ -406,7 +407,7 @@ def get_dpnp_descriptor( if queue is not None and copy_when_nondefault_queue: default_queue = dpctl.SyclQueue() queue_is_default = ( - dpctl.utils.get_execution_queue([queue, default_queue]) is not None + dpt.get_execution_queue([queue, default_queue]) is not None ) if not queue_is_default: ext_obj = dpnp.array(ext_obj, sycl_queue=default_queue) @@ -433,7 +434,7 @@ def get_include(): def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): """ Utility to process complementary keyword arguments 'device' and 'sycl_queue' - in subsequent calls of functions from `dpctl.tensor` module. + in subsequent calls of functions from `dpnp.tensor` module. If both arguments 'device' and 'sycl_queue' have default value ``None`` and 'obj' has `sycl_queue` attribute, it assumes that Compute Follows Data @@ -444,7 +445,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): ---------- obj : object, optional A python object. Can be an instance of `dpnp_array`, - `dpctl.tensor.usm_ndarray`, an object representing SYCL USM allocation + `dpnp.tensor.usm_ndarray`, an object representing SYCL USM allocation and implementing `__sycl_usm_array_interface__` protocol, an instance of `numpy.ndarray`, an object supporting Python buffer protocol, a Python scalar, or a (possibly nested) sequence of Python scalars. @@ -461,7 +462,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. The value ``None`` is interpreted as to use the same device as `obj`. @@ -471,7 +472,7 @@ def get_normalized_queue_device(obj=None, device=None, sycl_queue=None): ------- sycl_queue: dpctl.SyclQueue A :class:`dpctl.SyclQueue` object normalized by - `normalize_queue_device` call of `dpctl.tensor` module invoked with + `normalize_queue_device` call of `dpnp.tensor` module invoked with `device` and `sycl_queue` values. If both incoming `device` and `sycl_queue` are ``None`` and `obj` has `sycl_queue` attribute, the normalization will be performed for `obj.sycl_queue` value. @@ -539,13 +540,13 @@ def get_result_array(a, out=None, casting="safe"): def get_usm_ndarray(a): """ - Return :class:`dpctl.tensor.usm_ndarray` from input array `a`. + Return :class:`dpnp.tensor.usm_ndarray` from input array `a`. Parameters ---------- a : {dpnp.ndarray, usm_ndarray} Input array of supported type :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Returns ------- @@ -570,13 +571,13 @@ def get_usm_ndarray(a): def get_usm_ndarray_or_scalar(a): """ - Return scalar or :class:`dpctl.tensor.usm_ndarray` from input object `a`. + Return scalar or :class:`dpnp.tensor.usm_ndarray` from input object `a`. Parameters ---------- a : {scalar, dpnp_array, usm_ndarray} Input of any supported type: scalar, :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. 
Returns ------- @@ -633,7 +634,7 @@ def is_cuda_backend(obj=None): def is_supported_array_or_scalar(a): """ Return ``True`` if `a` is a scalar or an array of either - :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray` type, + :class:`dpnp.ndarray` or :class:`dpnp.tensor.usm_ndarray` type, ``False`` otherwise. Parameters @@ -655,7 +656,7 @@ def is_supported_array_or_scalar(a): def is_supported_array_type(a): """ Return ``True`` if an array of either type :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray` type, ``False`` otherwise. + or :class:`dpnp.tensor.usm_ndarray` type, ``False`` otherwise. Parameters ---------- diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 5bcf5ea19b82..da6b45517eb3 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -43,10 +43,10 @@ import operator -import dpctl.tensor as dpt import numpy import dpnp +import dpnp.tensor as dpt from dpnp import dpnp_container from .dpnp_algo.dpnp_arraycreation import ( @@ -175,7 +175,7 @@ def arange( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -295,7 +295,7 @@ def array( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -440,7 +440,7 @@ def asanyarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -545,7 +545,7 @@ def asarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -646,7 +646,7 @@ def ascontiguousarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -760,7 +760,7 @@ def asfortranarray( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
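
The helpers renamed in the hunks above unwrap without copying; a small sketch of the documented behavior, assuming `get_usm_ndarray_or_scalar` is re-exported at the dpnp top level the same way `get_usm_ndarray` is:

    import dpnp

    a = dpnp.arange(5)
    ua = dpnp.get_usm_ndarray(a)  # underlying usm_ndarray, zero-copy
    assert ua is a.get_array()

    s = dpnp.get_usm_ndarray_or_scalar(3.5)  # scalars pass through
    assert s == 3.5
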
@@ -897,7 +897,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. If the value is ``None``, returned array is created on the same device as `x`. @@ -966,7 +966,7 @@ def copy( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1086,7 +1086,7 @@ def diag(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1191,7 +1191,7 @@ def diagflat(v, /, k=0, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1297,7 +1297,7 @@ def empty( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1403,7 +1403,7 @@ def empty_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1515,7 +1515,7 @@ def eye( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1627,7 +1627,7 @@ def frombuffer( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
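
For the `device` keyword repeated through these docstrings, passing the `Device` object of an existing array is the cheapest way to keep a result co-located with its input; a sketch:

    import dpnp

    x = dpnp.arange(3, dtype="i4")
    y = dpnp.astype(x, "f4", device=x.device)  # stays on x's queue
    assert y.sycl_queue == x.sycl_queue
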
@@ -1747,7 +1747,7 @@ def fromfile( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1868,7 +1868,7 @@ def fromfunction( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1979,7 +1979,7 @@ def fromiter( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2081,7 +2081,7 @@ def fromstring( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2158,9 +2158,9 @@ def from_dlpack(x, /, *, device=None, copy=None): to a non-partitioned SYCL device. * :class:`dpctl.SyclQueue` : Implies SYCL device targeted by the SYCL queue. - * :class:`dpctl.tensor.Device` : Implies SYCL device + * :class:`dpnp.tensor.Device` : Implies SYCL device ``device.sycl_queue``. The `device` object is obtained via - :attr:`dpctl.tensor.usm_ndarray.device`. + :attr:`dpnp.tensor.usm_ndarray.device`. * ``(device_type, device_id)`` : 2-tuple matching the format of the output of the :meth:`dpnp.ndarray.__dlpack_device__`: an integer enumerator representing the device type followed by an integer @@ -2205,7 +2205,7 @@ def from_dlpack(x, /, *, device=None, copy=None): If the return type is :class:`dpnp.ndarray`, the associated SYCL queue is derived from the `device` keyword. When `device` keyword value has type :class:`dpctl.SyclQueue`, the explicit queue instance is used, when `device` - keyword value has type :class:`dpctl.tensor.Device`, the + keyword value has type :class:`dpnp.tensor.Device`, the ``device.sycl_queue`` is used. In all other cases, the cached SYCL queue corresponding to the implied SYCL device is used. @@ -2261,7 +2261,7 @@ def full( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2370,7 +2370,7 @@ def full_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
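
The `from_dlpack` semantics described in this hunk can be exercised round-trip; the 2-tuple form of `device` mirrors the `__dlpack_device__` output. A hedged sketch:

    import dpnp

    x = dpnp.linspace(0, 1, 4)
    dev = x.__dlpack_device__()          # (device_type, device_id)
    y = dpnp.from_dlpack(x, device=dev)  # zero-copy when devices match
    assert bool((y == x).all())
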
@@ -2485,7 +2485,7 @@ def geomspace( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2597,7 +2597,7 @@ def identity( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2715,7 +2715,7 @@ def linspace( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2827,7 +2827,7 @@ def loadtxt( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2942,7 +2942,7 @@ def logspace( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3148,7 +3148,7 @@ class MGridClass: `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3227,7 +3227,7 @@ class OGridClass: `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3317,7 +3317,7 @@ def ones( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3429,7 +3429,7 @@ def ones_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
@@ -3602,7 +3602,7 @@ def tri( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3840,7 +3840,7 @@ def vander( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -3970,7 +3970,7 @@ def zeros( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -4082,7 +4082,7 @@ def zeros_like( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/dpnp_iface_bitwise.py b/dpnp/dpnp_iface_bitwise.py index 733fbc697241..604fd365ee18 100644 --- a/dpnp/dpnp_iface_bitwise.py +++ b/dpnp/dpnp_iface_bitwise.py @@ -43,10 +43,10 @@ # pylint: disable=no-name-in-module # pylint: disable=protected-access -import dpctl.tensor._tensor_elementwise_impl as ti import numpy import dpnp.backend.extensions.ufunc._ufunc_impl as ufi +import dpnp.tensor._tensor_elementwise_impl as ti from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc diff --git a/dpnp/dpnp_iface_counting.py b/dpnp/dpnp_iface_counting.py index a4b85aa85294..7bb13422f819 100644 --- a/dpnp/dpnp_iface_counting.py +++ b/dpnp/dpnp_iface_counting.py @@ -39,9 +39,8 @@ """ -import dpctl.tensor as dpt - import dpnp +import dpnp.tensor as dpt def count_nonzero(a, axis=None, *, keepdims=False, out=None): diff --git a/dpnp/dpnp_iface_functional.py b/dpnp/dpnp_iface_functional.py index 1985eced2e71..0ed965b0698f 100644 --- a/dpnp/dpnp_iface_functional.py +++ b/dpnp/dpnp_iface_functional.py @@ -41,15 +41,14 @@ # pylint: disable=protected-access -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) - import dpnp # pylint: disable=no-name-in-module -from dpnp.dpnp_utils import get_usm_allocations +from .dpnp_utils import get_usm_allocations +from .tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) def apply_along_axis(func1d, axis, arr, *args, **kwargs): diff --git a/dpnp/dpnp_iface_histograms.py b/dpnp/dpnp_iface_histograms.py index 0a2f18fe3644..7e91968926fc 100644 --- a/dpnp/dpnp_iface_histograms.py +++ b/dpnp/dpnp_iface_histograms.py @@ -53,6 +53,7 @@ result_type_for_device, to_supported_dtypes, ) +from dpnp.tensor import get_coerced_usm_type, get_execution_queue # pylint: disable=no-name-in-module from .dpnp_utils import get_usm_allocations @@ -87,10 +88,10 @@ def _ravel_check_a_and_weights(a, weights): if weights is not None: # check that `weights` array has supported type dpnp.check_supported_arrays_type(weights) - 
usm_type = dpu.get_coerced_usm_type([usm_type, weights.usm_type]) + usm_type = get_coerced_usm_type([usm_type, weights.usm_type]) # check that arrays have the same allocation queue - if dpu.get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None: + if get_execution_queue([a.sycl_queue, weights.sycl_queue]) is None: raise ValueError( "a and weights must be allocated on the same SYCL queue" ) @@ -173,7 +174,7 @@ def _get_bin_edges(a, bins, range, usm_type): elif numpy.ndim(bins) == 1: if dpnp.is_supported_array_type(bins): - if dpu.get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None: + if get_execution_queue([a.sycl_queue, bins.sycl_queue]) is None: raise ValueError( "a and bins must be allocated on the same SYCL queue" ) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 2a90f6cff637..1c9776582b73 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -44,19 +44,18 @@ import operator from collections.abc import Iterable -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._copy_utils import _nonzero_impl -from dpctl.tensor._indexing_functions import _get_indexing_mode -from dpctl.tensor._numpy_helper import normalize_axis_index import dpnp # pylint: disable=no-name-in-module import dpnp.backend.extensions.indexing._indexing_impl as indexing_ext +# pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + # pylint: disable=no-name-in-module from .dpnp_algo import ( dpnp_putmask, @@ -64,6 +63,9 @@ from .dpnp_array import dpnp_array from .dpnp_utils import call_origin, get_usm_allocations from .exceptions import ExecutionPlacementError +from .tensor._copy_utils import _nonzero_impl +from .tensor._indexing_functions import _get_indexing_mode +from .tensor._numpy_helper import normalize_axis_index def _ravel_multi_index_checks(multi_index, dims, order): @@ -99,7 +101,7 @@ def _build_choices_list(choices): list of arrays. If a single array of dimension greater than one, the array will be unstacked. - Returns a list of :class:`dpctl.tensor.usm_ndarray`s. + Returns a list of :class:`dpnp.tensor.usm_ndarray`s. """ if dpnp.is_supported_array_type(choices): @@ -129,7 +131,7 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0): f"got {out.dtype}" ) - if dpu.get_execution_queue((q, out.sycl_queue)) is None: + if dpt.get_execution_queue((q, out.sycl_queue)) is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -291,7 +293,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): f"Output array of type {x.dtype} is needed, " f"got {out.dtype}" ) - if dpu.get_execution_queue((q, out.sycl_queue)) is None: + if dpt.get_execution_queue((q, out.sycl_queue)) is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -445,7 +447,7 @@ def diag_indices(n, ndim=2, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
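
The `get_execution_queue` check that `_ravel_check_a_and_weights` and `_get_bin_edges` now call through `dpnp.tensor` returns the common queue, or ``None`` on a mismatch, which the histogram code converts into a ``ValueError``. A sketch (queue construction is an assumption about the local device):

    import dpctl
    from dpnp.tensor import get_execution_queue

    q1 = dpctl.SyclQueue()
    q2 = dpctl.SyclQueue()  # a second, distinct queue
    assert get_execution_queue([q1, q1]) is not None
    assert get_execution_queue([q1, q2]) is None  # incompatible
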
@@ -1044,7 +1046,7 @@ def indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1308,7 +1310,7 @@ def mask_indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2321,7 +2323,7 @@ def tril_indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -2538,7 +2540,7 @@ def triu_indices( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 6464bd49af1b..faa84dd538a4 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -43,16 +43,15 @@ # pylint: disable=duplicate-code # pylint: disable=no-name-in-module - -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi -from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti +from .dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc from .dpnp_array import dpnp_array from .dpnp_utils import get_usm_allocations from .exceptions import ExecutionPlacementError @@ -1263,7 +1262,7 @@ def isin( usm_element = dpnp.get_usm_ndarray(element) else: if ( - dpu.get_execution_queue( + dpt.get_execution_queue( (element.sycl_queue, test_elements.sycl_queue) ) is None diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 0594a406ac5a..b96d36a40e6a 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -45,14 +45,10 @@ from typing import NamedTuple import dpctl -import dpctl.tensor as dpt import numpy -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) import dpnp +import dpnp.tensor as dpt from .dpnp_array import dpnp_array @@ -60,6 +56,10 @@ from .dpnp_utils import get_usm_allocations from .dpnp_utils.dpnp_utils_pad import dpnp_pad from .exceptions import AxisError +from .tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) class InsertDeleteParams(NamedTuple): @@ -692,7 +692,7 @@ def asarray_chkfinite( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + 
:class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -791,7 +791,7 @@ def asfarray(a, dtype=None, *, device=None, usm_type=None, sycl_queue=None): a : array_like Input data, in any form that can be converted to an array. This includes an instance of :class:`dpnp.ndarray` or - :class:`dpctl.tensor.usm_ndarray`, an object representing + :class:`dpnp.tensor.usm_ndarray`, an object representing SYCL USM allocation and implementing `__sycl_usm_array_interface__` protocol, an instance of :class:`numpy.ndarray`, an object supporting Python buffer protocol, a Python scalar, or a (possibly nested) @@ -808,7 +808,7 @@ def asfarray(a, dtype=None, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index e06904a57bda..ddecd1f751d9 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -40,25 +40,21 @@ """ # pylint: disable=protected-access +# pylint: disable=duplicate-code # pylint: disable=no-name-in-module import builtins import warnings -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) -from dpctl.tensor._type_utils import _acceptance_fn_divide import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti +import dpnp.tensor._type_utils as dtu from .dpnp_algo.dpnp_elementwise_common import ( DPNPI0, @@ -85,6 +81,10 @@ from .dpnp_utils.dpnp_utils_linearalgebra import dpnp_cross from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call from .exceptions import ExecutionPlacementError +from .tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) def _get_max_min(dtype): @@ -273,9 +273,9 @@ def _process_ediff1d_args(arg, arg_name, ary_dtype, ary_sycl_queue, usm_type): if not dpnp.is_supported_array_type(arg): arg = dpnp.asarray(arg, usm_type=usm_type, sycl_queue=ary_sycl_queue) else: - usm_type = dpu.get_coerced_usm_type([usm_type, arg.usm_type]) + usm_type = dpt.get_coerced_usm_type([usm_type, arg.usm_type]) # check that arrays have the same allocation queue - if dpu.get_execution_queue([ary_sycl_queue, arg.sycl_queue]) is None: + if dpt.get_execution_queue([ary_sycl_queue, arg.sycl_queue]) is None: raise ExecutionPlacementError( f"ary and {arg_name} must be allocated on the same SYCL queue" ) @@ -307,7 +307,7 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None): f"a {name} value must be 0-dimensional, " f"but got {param.ndim}-dim" ) - if dpu.get_execution_queue([exec_q, param.sycl_queue]) is None: + if dpt.get_execution_queue([exec_q, param.sycl_queue]) is None: raise ValueError( f"input arrays and {name} must be allocated " "on the same SYCL queue" @@ -1564,7 +1564,7 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): mkl_fn_to_call="_mkl_div_to_call", mkl_impl_fn="_div", binary_inplace_fn=ti._divide_inplace, - acceptance_fn=_acceptance_fn_divide, + 
acceptance_fn=dtu._acceptance_fn_divide, ) @@ -2724,7 +2724,7 @@ def gradient(f, *varargs, axis=None, edge_order=1): if dpnp.isscalar(ax_dx): usm_type = f.usm_type else: - usm_type = dpu.get_coerced_usm_type([f.usm_type, ax_dx.usm_type]) + usm_type = dpt.get_coerced_usm_type([f.usm_type, ax_dx.usm_type]) out = dpnp.empty_like(f, dtype=otype, usm_type=usm_type) # spacing for the current axis diff --git a/dpnp/dpnp_iface_nanfunctions.py b/dpnp/dpnp_iface_nanfunctions.py index a5fb750cf586..10fffb342305 100644 --- a/dpnp/dpnp_iface_nanfunctions.py +++ b/dpnp/dpnp_iface_nanfunctions.py @@ -167,7 +167,7 @@ def nanargmax(a, axis=None, out=None, *, keepdims=False): Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Input array data types are limited by supported DPNP :ref:`Data types`. See Also @@ -251,7 +251,7 @@ def nanargmin(a, axis=None, out=None, *, keepdims=False): Limitations ----------- Input and output arrays are only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Input array data types are limited by supported DPNP :ref:`Data types`. See Also @@ -466,7 +466,7 @@ def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=True): Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Parameters `where`, and `initial` are only supported with their default values. Otherwise ``NotImplementedError`` exception will be raised. @@ -782,7 +782,7 @@ def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=True): Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`. + or :class:`dpnp.tensor.usm_ndarray`. Parameters `where`, and `initial` are only supported with their default values. Otherwise ``NotImplementedError`` exception will be raised. @@ -896,7 +896,7 @@ def nanprod( Limitations ----------- Input array is only supported as either :class:`dpnp.ndarray` or - :class:`dpctl.tensor.usm_ndarray`. + :class:`dpnp.tensor.usm_ndarray`. Parameters `initial`, and `where` are only supported with their default values. Otherwise the function will be executed sequentially on CPU. 
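
`get_coerced_usm_type`, now imported from `dpnp.tensor` in `_process_ediff1d_args` and `gradient`, coerces toward the most device-bound USM kind ("device" over "shared" over "host"); a sketch:

    from dpnp.tensor import get_coerced_usm_type

    assert get_coerced_usm_type(["host", "device"]) == "device"
    assert get_coerced_usm_type(["shared", "host"]) == "shared"
    assert get_coerced_usm_type(["host", "host"]) == "host"
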
diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 6eefe010b699..856fdbc98936 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -39,11 +39,14 @@ """ -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as dti +# pylint: disable=duplicate-code import dpnp +# pylint: disable=no-name-in-module +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as dti + from .dpnp_array import dpnp_array from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index 9c5097a5f3e3..8f6f3e80f0d1 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -41,10 +41,8 @@ from collections.abc import Sequence -import dpctl.tensor as dpt -from dpctl.tensor._numpy_helper import normalize_axis_index - import dpnp +import dpnp.tensor as dpt # pylint: disable=no-name-in-module from .dpnp_algo import ( @@ -54,6 +52,7 @@ from .dpnp_utils import ( map_dtype_to_device, ) +from .tensor._numpy_helper import normalize_axis_index def _wrap_sort_argsort( @@ -65,7 +64,7 @@ def _wrap_sort_argsort( descending=False, stable=True, ): - """Wrap a sorting call from dpctl.tensor interface.""" + """Wrap a sorting call from dpnp.tensor interface.""" if order is not None: raise NotImplementedError( diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index 7e092184366c..bf27fc98a4ce 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -39,26 +39,26 @@ """ +# pylint: disable=no-name-in-module + import math -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import normalize_axis_index import dpnp - -# pylint: disable=no-name-in-module import dpnp.backend.extensions.statistics._statistics_impl as statistics_ext -from dpnp.dpnp_utils.dpnp_utils_common import ( +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti + +from .dpnp_utils import get_usm_allocations +from .dpnp_utils.dpnp_utils_common import ( result_type_for_device, to_supported_dtypes, ) - -from .dpnp_utils import get_usm_allocations from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call from .dpnp_utils.dpnp_utils_statistics import dpnp_cov, dpnp_median +from .tensor._numpy_helper import normalize_axis_index def _count_reduce_items(arr, axis, where=True): @@ -670,7 +670,7 @@ def _run_native_sliding_dot_product1d(a, v, l_pad, r_pad, rdtype): a_casted = dpnp.asarray(a, dtype=supported_dtype, order="C") v_casted = dpnp.asarray(v, dtype=supported_dtype, order="C") - usm_type = dpu.get_coerced_usm_type([a_casted.usm_type, v_casted.usm_type]) + usm_type = dpt.get_coerced_usm_type([a_casted.usm_type, v_casted.usm_type]) out_size = l_pad + r_pad + a_casted.size - v_casted.size + 1 # out type is the same as input type out = dpnp.empty_like(a_casted, shape=out_size, usm_type=usm_type) diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index a46f06c10e08..35428a0416e7 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -42,13 +42,11 @@ # pylint: disable=protected-access # pylint: disable=no-name-in-module - -import dpctl.tensor as dpt -import dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu - import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as ti 
+import dpnp.tensor._type_utils as dtu from .dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc from .dpnp_utils.dpnp_utils_reduction import dpnp_wrap_reduction_call diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py index 8fdb9e1d3d38..d3b295289831 100644 --- a/dpnp/dpnp_iface_types.py +++ b/dpnp/dpnp_iface_types.py @@ -37,10 +37,10 @@ import functools import dpctl -import dpctl.tensor as dpt import numpy import dpnp +import dpnp.tensor as dpt from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface_window.py b/dpnp/dpnp_iface_window.py index f8d6df07443d..bc12e714663c 100644 --- a/dpnp/dpnp_iface_window.py +++ b/dpnp/dpnp_iface_window.py @@ -111,7 +111,7 @@ def bartlett(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -205,7 +205,7 @@ def blackman(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -296,7 +296,7 @@ def hamming(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -380,7 +380,7 @@ def hanning(M, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -466,7 +466,7 @@ def kaiser(M, beta, *, device=None, usm_type=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
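
The window functions touched above accept the same placement keywords as the creation routines, so a result can be pinned to an explicit queue; a sketch (queue choice is an assumption):

    import dpctl
    import dpnp

    q = dpctl.SyclQueue()
    w = dpnp.hamming(8, sycl_queue=q)
    assert w.sycl_queue == q and w.shape == (8,)
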
diff --git a/dpnp/dpnp_utils/dpnp_algo_utils.pyx b/dpnp/dpnp_utils/dpnp_algo_utils.pyx index 6ef9c9c28a12..00f40a0358e8 100644 --- a/dpnp/dpnp_utils/dpnp_algo_utils.pyx +++ b/dpnp/dpnp_utils/dpnp_algo_utils.pyx @@ -36,13 +36,13 @@ This module contains different helpers and utilities """ import dpctl -import dpctl.utils as dpu import numpy import dpnp import dpnp.config as config import dpnp.dpnp_container as dpnp_container from dpnp.dpnp_array import dpnp_array +from dpnp.tensor import get_coerced_usm_type, get_execution_queue cimport cpython cimport cython @@ -153,7 +153,7 @@ def call_origin(function, *args, **kwargs): kwargx = convert_item(kwarg) kwargs_new[key] = kwargx - exec_q = dpu.get_execution_queue(alloc_queues) + exec_q = get_execution_queue(alloc_queues) if exec_q is None: exec_q = dpnp.get_normalized_queue_device(sycl_queue=sycl_queue) # print(f"DPNP call_origin(): backend called. \n\t function={function}, \n\t args_new={args_new}, \n\t kwargs_new={kwargs_new}, \n\t dpnp_inplace={dpnp_inplace}") @@ -221,7 +221,7 @@ def _get_coerced_usm_type(objects): elif len(types_in_use) == 1: return types_in_use[0] - common_usm_type = dpu.get_coerced_usm_type(types_in_use) + common_usm_type = get_coerced_usm_type(types_in_use) if common_usm_type is None: raise ValueError("Input arrays must have coerced USM types") return common_usm_type @@ -234,7 +234,7 @@ def _get_common_allocation_queue(objects): elif len(queues_in_use) == 1: return queues_in_use[0] - common_queue = dpu.get_execution_queue(queues_in_use) + common_queue = get_execution_queue(queues_in_use) if common_queue is None: raise ValueError("Input arrays must be allocated on the same SYCL queue") return common_queue @@ -401,13 +401,13 @@ cdef tuple get_common_usm_allocation(dpnp_descriptor x1, dpnp_descriptor x2): array1_obj = x1.get_array() array2_obj = x2.get_array() - common_usm_type = dpctl.utils.get_coerced_usm_type((array1_obj.usm_type, array2_obj.usm_type)) + common_usm_type = get_coerced_usm_type((array1_obj.usm_type, array2_obj.usm_type)) if common_usm_type is None: raise ValueError( "could not recognize common USM type for inputs of USM types {} and {}" "".format(array1_obj.usm_type, array2_obj.usm_type)) - common_sycl_queue = dpu.get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue)) + common_sycl_queue = get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue)) if common_sycl_queue is None: raise ValueError( "could not recognize common SYCL queue for inputs in SYCL queues {} and {}" @@ -532,13 +532,13 @@ cdef class dpnp_descriptor: return self.origin_pyobj def get_array(self): - if isinstance(self.origin_pyobj, dpctl.tensor.usm_ndarray): + if isinstance(self.origin_pyobj, dpnp.tensor.usm_ndarray): return self.origin_pyobj if isinstance(self.origin_pyobj, dpnp_array): return self.origin_pyobj.get_array() raise TypeError( - "expected either dpctl.tensor.usm_ndarray or dpnp.dpnp_array.dpnp_array, got {}" + "expected either dpnp.tensor.usm_ndarray or dpnp.dpnp_array.dpnp_array, got {}" "".format(type(self.origin_pyobj))) cdef void * get_data(self): diff --git a/dpnp/dpnp_utils/dpnp_utils_common.py b/dpnp/dpnp_utils/dpnp_utils_common.py index e4bde2e1ec86..55d0f57ca1e2 100644 --- a/dpnp/dpnp_utils/dpnp_utils_common.py +++ b/dpnp/dpnp_utils/dpnp_utils_common.py @@ -29,9 +29,8 @@ from collections.abc import Iterable -import dpctl.tensor._type_utils as dtu - import dpnp +import dpnp.tensor._type_utils as dtu from dpnp.dpnp_utils import map_dtype_to_device __all__ = [ diff --git 
a/dpnp/dpnp_utils/dpnp_utils_einsum.py b/dpnp/dpnp_utils/dpnp_utils_einsum.py
index 4a1a58635989..b954e3f99467 100644
--- a/dpnp/dpnp_utils/dpnp_utils_einsum.py
+++ b/dpnp/dpnp_utils/dpnp_utils_einsum.py
@@ -31,7 +31,6 @@
 import operator
 import warnings

-import dpctl
 import numpy

 import dpnp
@@ -1023,7 +1022,7 @@ def dpnp_einsum(
     res_usm_type, exec_q = get_usm_allocations(arrays)
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+        if dpnp.tensor.get_execution_queue((exec_q, out.sycl_queue)) is None:
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index d2a1cdfbac46..2331eb7a10cc 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -26,21 +26,22 @@
 # THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************

-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dpu
 import numpy
-from dpctl.tensor._numpy_helper import (
-    normalize_axis_index,
-    normalize_axis_tuple,
-)

 import dpnp
 import dpnp.backend.extensions.blas._blas_impl as bi
+
+# pylint: disable=no-name-in-module
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
 from dpnp.dpnp_array import dpnp_array
 from dpnp.dpnp_utils import get_usm_allocations
 from dpnp.exceptions import AxisError, ExecutionPlacementError
+from dpnp.tensor._numpy_helper import (
+    normalize_axis_index,
+    normalize_axis_tuple,
+)

 __all__ = [
     "dpnp_cross",
@@ -692,7 +693,7 @@ def _validate_out_array(out, exec_q):
     """Validate out is supported array and has correct queue."""
     if out is not None:
         dpnp.check_supported_arrays_type(out)
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
+        if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
             raise ExecutionPlacementError(
                 "Input and output allocation queues are not compatible"
             )
@@ -769,7 +770,7 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False):
     The routine that is used to perform the main calculation
     depends on input arrays data type: 1) For integer and boolean data types,
-    `dpctl.tensor.vecdot` form the Data Parallel Control library is used,
+    `dpnp.tensor.vecdot` from the dpnp tensor module is used,
     2) For real-valued floating point data types, `dot` routines from
     BLAS library of OneMKL are used, and 3) For complex data types,
     `dotu` or `dotc` routines from BLAS library of OneMKL are used.
@@ -817,7 +818,7 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False):
         _manager.add_event_pair(ht_ev, dot_ev)
     else:
         # oneapi::mkl::blas::dot does not support integer dtypes,
-        # so using dpctl.tensor.vecdot instead
+        # so using dpnp.tensor.vecdot instead
         a_usm = dpnp.get_usm_ndarray(a)
         b_usm = dpnp.get_usm_ndarray(b)
         result = dpnp_array._create_from_usm_ndarray(dpt.vecdot(a_usm, b_usm))
@@ -1116,7 +1117,7 @@ def dpnp_multiplication(
     else:
         # oneapi::mkl::blas::gemm/gemv do not support integer dtypes,
         # except for special cases determined in `_gemm_special_case`,
-        # use dpctl.tensor.matmul for unsupported cases
+        # use dpnp.tensor.matmul for unsupported cases

         # `dpt.matmul` does not support `casting` kwarg.
         # We may need to change input dtypes based on given `casting`.
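# Illustrative sketch (not the actual dpnp source): the dtype-based dispatch
# described in the dpnp_dot docstring above. `pick_dot_backend` is a
# hypothetical helper introduced purely for illustration.
import numpy

def pick_dot_backend(dtype):
    """Map a dtype to the backend named in the dpnp_dot docstring."""
    if numpy.issubdtype(dtype, numpy.complexfloating):
        return "oneMKL BLAS dotu/dotc"
    if numpy.issubdtype(dtype, numpy.floating):
        return "oneMKL BLAS dot"
    # booleans and integers fall back to the tensor implementation
    return "dpnp.tensor.vecdot"

assert pick_dot_backend(numpy.dtype(numpy.int32)) == "dpnp.tensor.vecdot"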
diff --git a/dpnp/dpnp_utils/dpnp_utils_reduction.py b/dpnp/dpnp_utils/dpnp_utils_reduction.py index 8c13c6380870..ba9830bd7eff 100644 --- a/dpnp/dpnp_utils/dpnp_utils_reduction.py +++ b/dpnp/dpnp_utils/dpnp_utils_reduction.py @@ -33,7 +33,7 @@ def dpnp_wrap_reduction_call(usm_a, out, _reduction_fn, res_dt, **kwargs): - """Wrap a reduction call from dpctl.tensor interface.""" + """Wrap a reduction call from dpnp.tensor interface.""" input_out = out if out is None: diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py index c8414b661851..ac62ddcc2766 100644 --- a/dpnp/dpnp_utils/dpnp_utils_statistics.py +++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py @@ -28,13 +28,11 @@ import warnings -import dpctl -import dpctl.tensor as dpt -from dpctl.tensor._numpy_helper import normalize_axis_tuple - import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.exceptions import ExecutionPlacementError +from dpnp.tensor._numpy_helper import normalize_axis_tuple __all__ = ["dpnp_cov", "dpnp_median"] @@ -67,7 +65,7 @@ def _calc_nanmedian(a, out=None): res = dpnp.empty_like(valid_counts, dtype=a.dtype) else: dpnp.check_supported_arrays_type(out) - exec_q = dpctl.utils.get_execution_queue((a.sycl_queue, out.sycl_queue)) + exec_q = dpt.get_execution_queue((a.sycl_queue, out.sycl_queue)) if exec_q is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" diff --git a/dpnp/exceptions/__init__.py b/dpnp/exceptions/__init__.py index 26d78a853f41..99587311cf0d 100644 --- a/dpnp/exceptions/__init__.py +++ b/dpnp/exceptions/__init__.py @@ -32,10 +32,11 @@ SyclQueueCreationError, ) from dpctl.memory import USMAllocationError -from dpctl.tensor._dlpack import DLPackCreationError -from dpctl.utils import ExecutionPlacementError from numpy.exceptions import AxisError +from dpnp.tensor import ExecutionPlacementError +from dpnp.tensor._dlpack import DLPackCreationError + __all__ = [ "AxisError", "DLPackCreationError", diff --git a/dpnp/fft/dpnp_iface_fft.py b/dpnp/fft/dpnp_iface_fft.py index fcc222640c9a..90e1a112bdaf 100644 --- a/dpnp/fft/dpnp_iface_fft.py +++ b/dpnp/fft/dpnp_iface_fft.py @@ -263,7 +263,7 @@ def fftfreq( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1581,7 +1581,7 @@ def rfftfreq( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
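# Self-contained sketch of the compute-follows-data check used in the hunks
# above: get_execution_queue returns None unless all queues coalesce to a
# single queue. Assumes a default SYCL device is available at runtime.
import dpnp
import dpnp.tensor as dpt
from dpnp.exceptions import ExecutionPlacementError

a = dpnp.arange(3)
out = dpnp.empty(3)
if dpt.get_execution_queue((a.sycl_queue, out.sycl_queue)) is None:
    raise ExecutionPlacementError(
        "Input and output allocation queues are not compatible"
    )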
diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 28032b9d3be2..733436ab9887 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -41,18 +41,18 @@ from collections.abc import Sequence -import dpctl -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import ( - normalize_axis_index, - normalize_axis_tuple, -) import dpnp import dpnp.backend.extensions.fft._fft_impl as fi +import dpnp.tensor._tensor_impl as ti from dpnp.exceptions import ExecutionPlacementError +from dpnp.tensor import get_execution_queue +from dpnp.tensor._numpy_helper import ( + normalize_axis_index, + normalize_axis_tuple, +) from ..dpnp_array import dpnp_array from ..dpnp_utils import map_dtype_to_device @@ -196,8 +196,8 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): out_usm = None if out is None else dpnp.get_usm_ndarray(out) if ( out is not None - and out_usm.strides == tuple(out_strides) - and not ti._array_overlap(a_usm, out_usm) + and out.strides == tuple(out_strides) + and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = out_usm result = out @@ -546,10 +546,7 @@ def _validate_out_keyword(a, out, s, axes, c2c, c2r, r2c): """Validate out keyword argument.""" if out is not None: dpnp.check_supported_arrays_type(out) - if ( - dpctl.utils.get_execution_queue((a.sycl_queue, out.sycl_queue)) - is None - ): + if get_execution_queue((a.sycl_queue, out.sycl_queue)) is None: raise ExecutionPlacementError( "Input and output allocation queues are not compatible" ) @@ -779,7 +776,7 @@ def dpnp_fillfreq(a, m, n, val): """Fill an array with the sample frequencies""" exec_q = a.sycl_queue - _manager = dpctl.utils.SequentialOrderManager[exec_q] + _manager = dpu.SequentialOrderManager[exec_q] # it's assumed there are no dependent events to populate the array ht_lin_ev, lin_ev = ti._linspace_step(0, 1, a[:m].get_array(), exec_q) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 6959565ecf17..625d387667ac 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -45,10 +45,10 @@ from typing import NamedTuple import numpy -from dpctl.tensor._numpy_helper import normalize_axis_tuple import dpnp from dpnp.backend.extensions.lapack._lapack_impl import LinAlgError +from dpnp.tensor._numpy_helper import normalize_axis_tuple from .dpnp_utils_linalg import ( assert_2d, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 6881c7787e9f..cf6d1ff231f2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -42,15 +42,17 @@ from typing import NamedTuple -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy -from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li + +# pylint: disable=no-name-in-module +import dpnp.tensor._tensor_impl as ti from dpnp.dpnp_utils import get_usm_allocations +from dpnp.tensor._numpy_helper import normalize_axis_index # pylint:disable=missing-class-docstring @@ -1262,7 +1264,7 @@ def _real_type(dtype, device=None): type is created. 
`device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, - or a :class:`dpctl.tensor.Device` object returned by + or a :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. diff --git a/dpnp/memory/_memory.py b/dpnp/memory/_memory.py index 70d93c04d6a5..ee0188d33b39 100644 --- a/dpnp/memory/_memory.py +++ b/dpnp/memory/_memory.py @@ -26,11 +26,12 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -import dpctl.tensor as dpt from dpctl.memory import MemoryUSMDevice as DPCTLMemoryUSMDevice from dpctl.memory import MemoryUSMHost as DPCTLMemoryUSMHost from dpctl.memory import MemoryUSMShared as DPCTLMemoryUSMShared +import dpnp.tensor as dpt + def _add_ptr_property(cls): _storage_attr = "_ptr" @@ -76,7 +77,7 @@ def create_data(x): Parameters ---------- x : usm_ndarray - Input array of :class:`dpctl.tensor.usm_ndarray` type. + Input array of :class:`dpnp.tensor.usm_ndarray` type. Returns ------- diff --git a/dpnp/random/dpnp_iface_random.py b/dpnp/random/dpnp_iface_random.py index 31a82fa5ac7b..3cafe12b1958 100644 --- a/dpnp/random/dpnp_iface_random.py +++ b/dpnp/random/dpnp_iface_random.py @@ -839,7 +839,7 @@ def normal( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1100,7 +1100,7 @@ def rand(*args, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1161,7 +1161,7 @@ def randint( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1222,7 +1222,7 @@ def randn(d0, *dn, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1277,7 +1277,7 @@ def random(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
@@ -1328,7 +1328,7 @@ def random_integers( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1396,7 +1396,7 @@ def random_sample(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1446,7 +1446,7 @@ def ranf(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1537,7 +1537,7 @@ def sample(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1616,7 +1616,7 @@ def seed(seed=None, device=None, sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1777,7 +1777,7 @@ def standard_normal(size=None, device=None, usm_type="device", sycl_queue=None): `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -1922,7 +1922,7 @@ def uniform( `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. 
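# Hedged usage sketch for the `device` and `usm_type` keywords documented in
# the random-API hunks above; call signatures follow the docstrings shown,
# not a verified installation. Assumes a default SYCL device.
import dpnp

x = dpnp.random.normal(loc=0.0, scale=1.0, size=4, usm_type="device")
y = dpnp.random.rand(4, device=x.device)  # place on the same device
assert y.usm_type == "device"             # "device" is the documented default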
diff --git a/dpnp/random/dpnp_random_state.py b/dpnp/random/dpnp_random_state.py index e49fe739aedd..9456169ec114 100644 --- a/dpnp/random/dpnp_random_state.py +++ b/dpnp/random/dpnp_random_state.py @@ -36,7 +36,6 @@ """ -import dpctl.utils as dpu import numpy import dpnp @@ -46,6 +45,7 @@ use_origin_backend, ) from dpnp.random.dpnp_algo_random import MCG59, MT19937 +from dpnp.tensor import validate_usm_type class RandomState: @@ -65,7 +65,7 @@ class RandomState: `device` can be ``None``, a oneAPI filter selector string, an instance of :class:`dpctl.SyclDevice` corresponding to a non-partitioned SYCL device, an instance of :class:`dpctl.SyclQueue`, or a - :class:`dpctl.tensor.Device` object returned by + :class:`dpnp.tensor.Device` object returned by :attr:`dpnp.ndarray.device`. Default: ``None``. @@ -269,7 +269,7 @@ def normal( f"scale={scale}, but must be non-negative." ) - dpu.validate_usm_type(usm_type, allow_none=False) + validate_usm_type(usm_type, allow_none=False) return self._random_state.normal( loc=loc, scale=scale, @@ -635,7 +635,7 @@ def uniform( dtype = self._validate_float_dtype( dtype, (dpnp.int32, dpnp.float32, dpnp.float64) ) - dpu.validate_usm_type(usm_type, allow_none=False) + validate_usm_type(usm_type, allow_none=False) return self._random_state.uniform( low=low, diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index d083f1c2c0a2..4fe2b9fb32a3 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -43,11 +43,13 @@ from warnings import warn -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li + +# pylint: disable=no-name-in-module +import dpnp.tensor._tensor_impl as ti from dpnp.dpnp_utils import get_usm_allocations from dpnp.linalg.dpnp_utils_linalg import _common_type, _real_type diff --git a/dpnp/tensor/CMakeLists.txt b/dpnp/tensor/CMakeLists.txt new file mode 100644 index 000000000000..d0fe57cade64 --- /dev/null +++ b/dpnp/tensor/CMakeLists.txt @@ -0,0 +1,394 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+find_package(Python COMPONENTS Development.Module)
+
+# Tensor-specific flags
+
+# dpctl doesn't add -fsycl globally,
+# only to pybind11 module sources via add_sycl_to_target()
+string(REPLACE "-fsycl " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+# Use LLD linker (dpctl sets this at root level)
+if(UNIX)
+    add_link_options("-fuse-ld=lld")
+endif()
+
+# Remove global coverage flags for tensor;
+# use a link-time-only approach like dpctl
+if(DPNP_GENERATE_COVERAGE)
+    string(REPLACE "-fprofile-instr-generate " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "-fcoverage-mapping " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    string(REPLACE "-fno-sycl-use-footer " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endif()
+
+# Tensor-specific debug flags:
+# disable device-code debug info for Debug and Coverage builds to speed up linking
+if(
+    CMAKE_BUILD_TYPE STREQUAL "Debug"
+    OR CMAKE_BUILD_TYPE STREQUAL "DEBUG"
+    OR CMAKE_BUILD_TYPE STREQUAL "Coverage"
+)
+    if(WIN32)
+        add_compile_options(-Xsycl-target-frontend=spir64 "-g0")
+    elseif(UNIX)
+        add_compile_options(-Xsycl-target-frontend=spir64 "-g0")
+        if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+            string(REPLACE "-g1" "-g" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
+            string(REPLACE "-g1" "-g" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
+        endif()
+    endif()
+endif()
+
+# Match dpctl warning flags:
+# suppress unused-parameter warnings
+add_compile_options(-Wno-unused-parameter)
+
+file(GLOB _cython_sources *.pyx)
+foreach(_cy_file ${_cython_sources})
+    get_filename_component(_trgt ${_cy_file} NAME_WLE)
+    build_dpnp_tensor_ext(${_trgt} ${_cy_file} "dpnp/tensor" RELATIVE_PATH "..")
+    target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+endforeach()
+
+if(WIN32)
+    if(${CMAKE_VERSION} VERSION_LESS "3.27")
+        # this is a work-around for target_link_options inserting the option
+        # after the -link option, causing the linker to ignore it.
+ set(CMAKE_CXX_LINK_FLAGS + "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel" + ) + endif() +endif() + +# TODO: reuse this library for dpnp ufunc extension build +set(_static_lib_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) +set(_accumulator_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp +) +set(_elementwise_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/angle.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp +) +set(_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp +) +set(_sorting_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp +) +set(_linalg_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linalg_functions/dot.cpp +) +set(_tensor_accumulation_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${_accumulator_sources} +) +set(_tensor_elementwise_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp + ${_elementwise_sources} +) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${_reduction_sources} +) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) +set(_tensor_linalg_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_linalg.cpp + ${_linalg_sources} +) + +set(_static_lib_trgt simplify_iteration_space) + +add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) +target_include_directories( + ${_static_lib_trgt} + PRIVATE + # ${Python_INCLUDE_DIRS} + # ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include +) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module) +set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +set(_py_trgts) + +set(python_module_name _tensor_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) 
+add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_accumulation_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_elementwise_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_linalg_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(_clang_prefix "") +if(WIN32) + set(_clang_prefix "/clang:") +endif() + +set(_no_fast_math_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp +) +list( + APPEND _no_fast_math_sources + ${_elementwise_sources} + ${_reduction_sources} + ${_sorting_sources} + ${_linalg_sources} + ${_accumulator_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() + +set(_compiler_definitions "") + +foreach(_src_fn ${_elementwise_sources}) + get_source_file_property(_cmpl_options_defs ${_src_fn} COMPILE_DEFINITIONS) + if(${_cmpl_options_defs}) + set(_combined_options_defs ${_cmpl_options_defs} "${_compiler_definitions}") + else() + set(_combined_options_defs "${_compiler_definitions}") + endif() + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_DEFINITIONS "${_combined_options_defs}" + ) +endforeach() + +set(_linker_options "LINKER:${DPNP_LDFLAGS}") +foreach(python_module_name ${_py_trgts}) + target_compile_options( + ${python_module_name} + PRIVATE -fno-sycl-id-queries-fit-in-int + ) + target_link_options( + ${python_module_name} + PRIVATE 
-fsycl-device-code-split=per_kernel + ) + if(DPNP_TENSOR_OFFLOAD_COMPRESS) + target_link_options(${python_module_name} PRIVATE --offload-compress) + endif() + + target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ${CMAKE_BINARY_DIR} # For generated Cython headers + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + if(DPNP_GENERATE_COVERAGE) + if(DPNP_TENSOR_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + if(_dpnp_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE ${_dpnp_sycl_target_compile_options} + ) + target_link_options( + ${python_module_name} + PRIVATE ${_dpnp_sycl_target_link_options} + ) + endif() + # Ensure Cython modules build first so _usmarray.h exists + add_dependencies(${python_module_name} _usmarray) + if(DPNP_WITH_REDIST) + set_target_properties( + ${python_module_name} + PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." + ) + endif() + install(TARGETS ${python_module_name} DESTINATION "dpnp/tensor") +endforeach() diff --git a/dpnp/tensor/__init__.pxd b/dpnp/tensor/__init__.pxd new file mode 100644 index 000000000000..a4bcecfec1d1 --- /dev/null +++ b/dpnp/tensor/__init__.pxd @@ -0,0 +1,36 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" This file declares the extension types and functions for the Cython API + implemented in _usmarray.pyx file. 
+""" + +# distutils: language = c++ +# cython: language_level=3 + +from ._usmarray cimport * diff --git a/dpnp/tensor/__init__.py b/dpnp/tensor/__init__.py new file mode 100644 index 000000000000..0118e04f7ab1 --- /dev/null +++ b/dpnp/tensor/__init__.py @@ -0,0 +1,425 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum +from ._array_api import __array_api_version__, __array_namespace_info__ +from ._clip import clip +from ._compute_follows_data import ( + ExecutionPlacementError, + get_coerced_usm_type, + get_execution_queue, + validate_usm_type, +) +from ._constants import e, inf, nan, newaxis, pi +from ._copy_utils import ( + asnumpy, + astype, + copy, + from_numpy, + to_numpy, +) +from ._ctors import ( + arange, + asarray, + empty, + empty_like, + eye, + full, + full_like, + linspace, + meshgrid, + ones, + ones_like, + tril, + triu, + zeros, + zeros_like, +) +from ._data_types import ( + bool, + complex64, + complex128, + dtype, + float16, + float32, + float64, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, +) +from ._device import Device +from ._dldevice_conversions import ( + dldevice_to_sycl_device, + sycl_device_to_dldevice, +) +from ._dlpack import from_dlpack +from ._elementwise_funcs import ( + abs, + acos, + acosh, + add, + angle, + asin, + asinh, + atan, + atan2, + atanh, + bitwise_and, + bitwise_invert, + bitwise_left_shift, + bitwise_or, + bitwise_right_shift, + bitwise_xor, + cbrt, + ceil, + conj, + copysign, + cos, + cosh, + divide, + equal, + exp, + exp2, + expm1, + floor, + floor_divide, + greater, + greater_equal, + hypot, + imag, + isfinite, + isinf, + isnan, + less, + less_equal, + log, + log1p, + log2, + log10, + logaddexp, + logical_and, + logical_not, + logical_or, + logical_xor, + maximum, + minimum, + multiply, + negative, + nextafter, + not_equal, + positive, + pow, + proj, + real, + reciprocal, + remainder, + round, + rsqrt, + sign, + signbit, + sin, + sinh, + sqrt, + square, + subtract, + tan, + tanh, + trunc, +) +from ._indexing_functions import ( + extract, + nonzero, + place, + put, + put_along_axis, + take, + take_along_axis, +) +from ._linear_algebra_functions import ( + matmul, + matrix_transpose, + tensordot, + vecdot, +) +from ._manipulation_functions import ( + broadcast_arrays, + broadcast_to, + concat, + expand_dims, + flip, + moveaxis, + permute_dims, + repeat, + roll, + squeeze, + stack, + swapaxes, + tile, + unstack, +) +from ._print import ( + get_print_options, + print_options, + set_print_options, + usm_ndarray_repr, + usm_ndarray_str, +) +from ._reduction import ( + argmax, + argmin, + count_nonzero, + logsumexp, + max, + min, + prod, + reduce_hypot, + sum, +) + +# isort: off +# placed here to avoid circular import +from ._usmarray import DLDeviceType, usm_ndarray + +# isort: on +from ._reshape import reshape +from ._search_functions import where +from ._searchsorted import searchsorted +from ._set_functions import ( + isin, + unique_all, + unique_counts, + unique_inverse, + unique_values, +) +from ._sorting import argsort, sort, top_k +from ._statistical_functions import mean, std, var +from ._testing import allclose +from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type +from ._utility_functions import all, any, diff + +__all__ = [ + "Device", + "DLDeviceType", + "usm_ndarray", + # data types + "bool", + "dtype", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", + # constants + "e", + "inf", + "nan", + "newaxis", + "pi", + # functions + "abs", + "acos", + "acosh", + "add", + "all", + "allclose", + "angle", + "any", + "arange", + "argmax", + 
"argmin", + "argsort", + "asarray", + "asin", + "asinh", + "asnumpy", + "astype", + "atan", + "atanh", + "atan2", + "bitwise_and", + "bitwise_invert", + "bitwise_left_shift", + "bitwise_or", + "bitwise_right_shift", + "bitwise_xor", + "broadcast_arrays", + "broadcast_to", + "can_cast", + "cbrt", + "ceil", + "concat", + "conj", + "copy", + "copysign", + "cos", + "cosh", + "count_nonzero", + "clip", + "cumulative_logsumexp", + "cumulative_prod", + "cumulative_sum", + "diff", + "divide", + "dldevice_to_sycl_device", + "empty", + "empty_like", + "equal", + "extract", + "expand_dims", + "eye", + "exp", + "exp2", + "expm1", + "finfo", + "flip", + "floor", + "floor_divide", + "from_dlpack", + "from_numpy", + "full", + "full_like", + "get_print_options", + "greater", + "greater_equal", + "hypot", + "iinfo", + "imag", + "isfinite", + "isinf", + "isdtype", + "isin", + "isnan", + "less", + "less_equal", + "linspace", + "log", + "logaddexp", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "logsumexp", + "log1p", + "log2", + "log10", + "max", + "maximum", + "mean", + "meshgrid", + "min", + "minimum", + "moveaxis", + "multiply", + "permute_dims", + "matmul", + "matrix_transpose", + "negative", + "nextafter", + "nonzero", + "not_equal", + "ones", + "ones_like", + "place", + "positive", + "pow", + "print_options", + "prod", + "proj", + "put", + "put_along_axis", + "real", + "reciprocal", + "reduce_hypot", + "remainder", + "repeat", + "reshape", + "result_type", + "roll", + "round", + "rsqrt", + "searchsorted", + "set_print_options", + "sign", + "signbit", + "sin", + "sinh", + "sort", + "sqrt", + "square", + "squeeze", + "stack", + "std", + "subtract", + "sum", + "swapaxes", + "sycl_device_to_dldevice", + "take", + "take_along_axis", + "tan", + "tanh", + "tensordot", + "tile", + "top_k", + "to_numpy", + "tril", + "triu", + "trunc", + "unique_all", + "unique_counts", + "unique_inverse", + "unique_values", + "unstack", + "usm_ndarray_repr", + "usm_ndarray_str", + "var", + "vecdot", + "where", + "zeros", + "zeros_like", + "__array_api_version__", + "__array_namespace_info__", + # utilities + "ExecutionPlacementError", + "get_coerced_usm_type", + "get_execution_queue", + "validate_usm_type", +] diff --git a/dpnp/tensor/_accumulation.py b/dpnp/tensor/_accumulation.py new file mode 100644 index 000000000000..069eb870f783 --- /dev/null +++ b/dpnp/tensor/_accumulation.py @@ -0,0 +1,466 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_accumulation_impl as tai +import dpnp.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index +from ._type_utils import ( + _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, + _to_device_supported_dtype, +) + + +def _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + _accumulate_fn, + _accumulate_include_initial_fn, + _dtype_supported, + _default_accumulation_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + appended_axis = False + if x.ndim == 0: + x = x[dpt.newaxis] + appended_axis = True + nd = x.ndim + if axis is None: + if nd > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format(nd) + ) + axis = 0 + else: + axis = normalize_axis_index(axis, nd, "axis") + sh = x.shape + res_sh = ( + sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh + ) + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + q = x.sycl_queue + inp_dt = x.dtype + res_usm_type = x.usm_type + if dtype is None: + res_dt = _default_accumulation_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + # checking now avoids unnecessary allocations + implemented_types = _dtype_supported(inp_dt, res_dt) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined accumulation data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + out_sh = out.shape + # append an axis to `out` if scalar + if appended_axis and not include_initial: + out = out[dpt.newaxis, ...] + orig_out = out + final_res_sh = res_sh[1:] + else: + final_res_sh = res_sh + if not out_sh == final_res_sh: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_sh}, got {out_sh}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {out.dtype}" + ) + if dpt.get_execution_queue((q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + # permute out array dims if necessary + if a1 != nd: + out = dpt.permute_dims(out, perm) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + out = dpt.permute_dims(out, perm) + + _manager = SequentialOrderManager[q] + depends = _manager.submitted_events + if implemented_types: + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=arr, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=depends, + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=arr, dst=out, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e, acc_ev) + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt): + tmp = dpt.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + else: + buf_dt = _default_accumulation_type_fn(inp_dt, q) + tmp = dpt.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt.empty( + res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + tmp_res = dpt.permute_dims(tmp_res, perm) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + ht_e_cpy2, cpy_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy_e2) + + if appended_axis: + out = dpt.squeeze(out) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(out, inv_perm) + + return out + + +def cumulative_sum( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative sum of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which cumulative sum must be computed. + If `None`, the sum is computed over the entire array. 
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+            * If `x` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array `x` is allocated.
+            * If `x` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative sum.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            additive identity, zero) as the first value along the provided axis
+            in the output. Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The shape of `out` must match the expected shape of the result,
+            and its data type must match the expected data type of the result
+            or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative sums. The returned array has the
+            data type as described in the `dtype` parameter description above.
+
+            The returned array shape is determined as follows:
+
+            * If `include_initial` is `False`, the returned array will
+              have the same shape as `x`
+            * If `include_initial` is `True`, the returned array will
+              have the same shape as `x` except the axis along which the
+              cumulative sum is calculated, which will have size `N+1`
+
+            where `N` is the size of the axis the cumulative sums are computed
+            along.
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumsum_over_axis,
+        tai._cumsum_final_axis_include_initial,
+        tai._cumsum_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_prod(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False,
+        out=None)
+
+    Calculates the cumulative product of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative product must be computed.
+            If `None`, the product is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumsum_over_axis,
+        tai._cumsum_final_axis_include_initial,
+        tai._cumsum_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_prod(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False,
+        out=None)
+
+    Calculates the cumulative product of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative product must be computed.
+            If `None`, the product is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+            * If `x` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array `x` is allocated.
+            * If `x` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative product.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            multiplicative identity, one) as the first value along the provided
+            axis in the output. Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            `out` must have the expected shape, and its data type must match
+            the expected data type of the result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative products. The returned array has
+            the data type as described in the `dtype` parameter description
+            above.
+
+            The returned array shape is determined as follows:
+
+            * If `include_initial` is `False`, the returned array will
+              have the same shape as `x`
+            * If `include_initial` is `True`, the returned array will
+              have the same shape as `x` except the axis along which the
+              cumulative product is calculated, which will have size `N+1`
+
+            where `N` is the size of the axis the cumulative products are
+            computed along.
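+
+    Example:
+        A minimal sketch, assuming the function is re-exported from the
+        `dpnp.tensor` namespace and a default-selected device is
+        available:
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.asarray([1, 2, 3, 4])
+        >>> dpt.cumulative_prod(x)
+        usm_ndarray([ 1,  2,  6, 24])
+        >>> dpt.cumulative_prod(x, include_initial=True)
+        usm_ndarray([ 1,  1,  2,  6, 24])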
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumprod_over_axis,
+        tai._cumprod_final_axis_include_initial,
+        tai._cumprod_dtype_supported,
+        _default_accumulation_dtype,
+    )
+
+
+def cumulative_logsumexp(
+    x, /, *, axis=None, dtype=None, include_initial=False, out=None
+):
+    """
+    cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False,
+        out=None)
+
+    Calculates the cumulative logsumexp of elements in the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which cumulative logsumexp must be computed.
+            If `None`, the logsumexp is computed over the entire array.
+            If `x` is a one-dimensional array, providing an `axis` is optional;
+            however, if `x` has more than one dimension, providing an `axis`
+            is required.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If `x` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              `x`.
+            * If `x` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array `x` is allocated.
+            * If `x` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array `x` is allocated.
+            * If `x` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array `x` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the cumulative logsumexp.
+            Default: `None`.
+        include_initial (bool):
+            boolean indicating whether to include the initial value (i.e., the
+            identity of the logsumexp operation, negative infinity) as the
+            first value along the provided axis in the output.
+            Default: `False`.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            `out` must have the expected shape, and its data type must match
+            the expected data type of the result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            an array containing cumulative logsumexp results. The returned
+            array has the data type as described in the `dtype` parameter
+            description above.
+
+            The returned array shape is determined as follows:
+
+            * If `include_initial` is `False`, the returned array will
+              have the same shape as `x`
+            * If `include_initial` is `True`, the returned array will
+              have the same shape as `x` except the axis along which the
+              cumulative logsumexp is calculated, which will have size
+              `N+1`
+
+            where `N` is the size of the axis the cumulative logsumexp
+            values are computed along.
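+
+    Example:
+        A minimal sketch, assuming the function is re-exported from the
+        `dpnp.tensor` namespace (results shown to limited precision):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.zeros(3)
+        >>> dpt.cumulative_logsumexp(x)  # log(1), log(2), log(3)
+        usm_ndarray([0.        , 0.69314718, 1.09861229])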
+    """
+    return _accumulate_common(
+        x,
+        axis,
+        dtype,
+        include_initial,
+        out,
+        tai._cumlogsumexp_over_axis,
+        tai._cumlogsumexp_final_axis_include_initial,
+        tai._cumlogsumexp_dtype_supported,
+        _default_accumulation_dtype_fp_types,
+    )
diff --git a/dpnp/tensor/_array_api.py b/dpnp/tensor/_array_api.py
new file mode 100644
index 000000000000..a18bc2be1824
--- /dev/null
+++ b/dpnp/tensor/_array_api.py
@@ -0,0 +1,254 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+
+import dpnp.tensor as dpt
+
+from ._tensor_impl import (
+    default_device_complex_type,
+    default_device_fp_type,
+    default_device_index_type,
+    default_device_int_type,
+)
+
+
+def _isdtype_impl(dtype, kind):
+    if isinstance(kind, str):
+        if kind == "bool":
+            return dtype.kind == "b"
+        elif kind == "signed integer":
+            return dtype.kind == "i"
+        elif kind == "unsigned integer":
+            return dtype.kind == "u"
+        elif kind == "integral":
+            return dtype.kind in "iu"
+        elif kind == "real floating":
+            return dtype.kind == "f"
+        elif kind == "complex floating":
+            return dtype.kind == "c"
+        elif kind == "numeric":
+            return dtype.kind in "iufc"
+        else:
+            raise ValueError(f"Unrecognized data type kind: {kind}")
+
+    elif isinstance(kind, tuple):
+        return any(_isdtype_impl(dtype, k) for k in kind)
+    else:
+        raise TypeError(f"Unsupported type for dtype kind: {type(kind)}")
+
+
+def _get_device_impl(d):
+    if d is None:
+        return dpctl.select_default_device()
+    elif isinstance(d, dpctl.SyclDevice):
+        return d
+    elif isinstance(d, (dpt.Device, dpctl.SyclQueue)):
+        return d.sycl_device
+    else:
+        try:
+            return dpctl.SyclDevice(d)
+        except TypeError:
+            raise TypeError(f"Unsupported type for device argument: {type(d)}")
+
+
+__array_api_version__ = "2024.12"
+
+
+class Info:
+    """namespace returned by ``__array_namespace_info__()``"""
+
+    def __init__(self):
+        self._capabilities = {
+            "boolean indexing": True,
+            "data-dependent shapes": True,
+            "max dimensions": None,
+        }
+        self._all_dtypes = {
+            "bool": dpt.bool,
+            "float32": dpt.float32,
+            "float64": dpt.float64,
+            "complex64": dpt.complex64,
+            "complex128": dpt.complex128,
+            "int8": dpt.int8,
+            "int16": dpt.int16,
+            "int32": dpt.int32,
+            "int64": dpt.int64,
+            "uint8": dpt.uint8,
+            "uint16": dpt.uint16,
+            "uint32": dpt.uint32,
+            "uint64": dpt.uint64,
+        }
+
+    def capabilities(self):
+        """
+        capabilities()
+
+        Returns a dictionary of ``dpctl``'s capabilities.
+
+        The dictionary contains the following keys:
+
+        ``"boolean indexing"``:
+            boolean indicating ``dpctl``'s support of boolean indexing.
+            Value: ``True``
+        ``"data-dependent shapes"``:
+            boolean indicating ``dpctl``'s support of data-dependent shapes.
+            Value: ``True``
+        ``"max dimensions"``:
+            integer indicating the maximum array dimension supported by
+            ``dpctl``. Value: ``None``
+
+        Returns:
+            dict:
+                dictionary of ``dpctl``'s capabilities
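+
+        Example:
+            A quick look at the returned dictionary (assumes the
+            inspection namespace is re-exported as
+            ``dpnp.tensor.__array_namespace_info__``):
+
+            >>> import dpnp.tensor as dpt
+            >>> dpt.__array_namespace_info__().capabilities()
+            {'boolean indexing': True, 'data-dependent shapes': True, 'max dimensions': None}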
+        """
+        return self._capabilities.copy()
+
+    def default_device(self):
+        """
+        default_device()
+
+        Returns the default SYCL device.
+        """
+        return dpctl.select_default_device()
+
+    def default_dtypes(self, *, device=None):
+        """
+        default_dtypes(*, device=None)
+
+        Returns a dictionary of default data types for ``device``.
+
+        Args:
+            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+                array API concept of device used in getting default data types.
+                ``device`` can be ``None`` (in which case the default device
+                is used), an instance of :class:`dpctl.SyclDevice`, an instance
+                of :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+                a filter selector string.
+                Default: ``None``.
+
+        Returns:
+            dict:
+                a dictionary of default data types for ``device``:
+
+                - ``"real floating"``: dtype
+                - ``"complex floating"``: dtype
+                - ``"integral"``: dtype
+                - ``"indexing"``: dtype
+        """
+        device = _get_device_impl(device)
+        return {
+            "real floating": dpt.dtype(default_device_fp_type(device)),
+            "complex floating": dpt.dtype(default_device_complex_type(device)),
+            "integral": dpt.dtype(default_device_int_type(device)),
+            "indexing": dpt.dtype(default_device_index_type(device)),
+        }
+
+    def dtypes(self, *, device=None, kind=None):
+        """
+        dtypes(*, device=None, kind=None)
+
+        Returns a dictionary of all Array API data types of a specified
+        ``kind`` supported by ``device``.
+
+        This dictionary only includes data types supported by the
+        `Python Array API <https://data-apis.org/array-api/latest/>`_
+        specification.
+
+        Args:
+            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+                array API concept of device used in getting default data types.
+                ``device`` can be ``None`` (in which case the default device is
+                used), an instance of :class:`dpctl.SyclDevice`, an instance of
+                :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+                a filter selector string.
+                Default: ``None``.
+
+            kind (Optional[str, Tuple[str, ...]]):
+                data type kind.
+
+                - if ``kind`` is ``None``, returns a dictionary of all data
+                  types supported by `device`
+                - if ``kind`` is a string, returns a dictionary containing the
+                  data types belonging to the data type kind specified.
+
+                  Supports:
+
+                  * ``"bool"``
+                  * ``"signed integer"``
+                  * ``"unsigned integer"``
+                  * ``"integral"``
+                  * ``"real floating"``
+                  * ``"complex floating"``
+                  * ``"numeric"``
+
+                - if ``kind`` is a tuple, the tuple represents a union of
+                  ``kind`` strings, and returns a dictionary containing data
+                  types corresponding to the specified union.
+
+                Default: ``None``.
+
+        Returns:
+            dict:
+                a dictionary of the supported data types of the specified
+                ``kind``
+        """
+        device = _get_device_impl(device)
+        _fp64 = device.has_aspect_fp64
+        if kind is None:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if _fp64 or (key != "float64" and key != "complex128")
+            }
+        else:
+            return {
+                key: val
+                for key, val in self._all_dtypes.items()
+                if (_fp64 or (key != "float64" and key != "complex128"))
+                and _isdtype_impl(val, kind)
+            }
+
+    def devices(self):
+        """
+        devices()
+
+        Returns a list of supported devices.
+        """
+        return dpctl.get_devices()
+
+
+def __array_namespace_info__():
+    """
+    __array_namespace_info__()
+
+    Returns a namespace with Array API namespace inspection utilities.
+
+    """
+    return Info()
diff --git a/dpnp/tensor/_clip.py b/dpnp/tensor/_clip.py
new file mode 100644
index 000000000000..44434fc0bb0c
--- /dev/null
+++ b/dpnp/tensor/_clip.py
@@ -0,0 +1,771 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as tei +import dpnp.tensor._tensor_impl as ti + +from ._copy_utils import ( + _empty_like_orderK, + _empty_like_pair_orderK, + _empty_like_triple_orderK, +) +from ._manipulation_functions import _broadcast_shape_impl +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _can_cast, + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) + + +def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev): + """ + Checks if both types `arg1_dtype` and `arg2_dtype` can be + cast to `res_dtype` according to the rule `safe` + """ + if arg1_dtype == res_dtype and arg2_dtype == res_dtype: + return None, None, res_dtype + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast( + arg2_dtype, res_dtype, _fp16, _fp64 + ): + # prevent unnecessary casting + ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype + ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype + return ret_buf1_dt, ret_buf2_dt, res_dtype + else: + return None, None, None + + +def _clip_none(x, val, out, order, _binary_fn): + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, val_usm_type = _get_queue_usm_type(val) + if q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + val_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + val_shape = _get_shape(val) + if not isinstance(val_shape, (tuple, list)): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + val_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape} and {val_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + val_dtype = _get_dtype(val, sycl_dev) + if not _validate_dtype(val_dtype): + raise ValueError("Operands have unsupported data types") + + val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev) + + res_dt = x.dtype + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if not _can_cast(val_dtype, res_dt, _fp16, _fp64): + raise ValueError( + f"function 'clip' does not support input types " + f"({x_dtype}, {val_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, copy_ev) + out = orig_out + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, 
copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + +def clip(x, /, min=None, max=None, out=None, order="K"): + """clip(x, min=None, max=None, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. + min ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing minimum values. + Must be compatible with `x` and `max` according + to broadcasting rules. + max ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing maximum values. + Must be compatible with `x` and `min` according + to broadcasting rules. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is + `None`. + Default: "K". + + Returns: + usm_ndarray: + An array with elements clipped to the range [`min`, `max`]. + The returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected `x` to be of dpnp.tensor.usm_ndarray type, got " + f"{type(x)}" + ) + if order not in ["K", "C", "F", "A"]: + order = "K" + if x.dtype.kind in "iu": + if isinstance(min, int) and min <= dpt.iinfo(x.dtype).min: + min = None + if isinstance(max, int) and max >= dpt.iinfo(x.dtype).max: + max = None + if min is None and max is None: + exec_q = x.sycl_queue + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != x.shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {x.shape}, "
+                    f"got {out.shape}"
+                )
+
+            if x.dtype != out.dtype:
+                raise ValueError(
+                    f"Output array of type {x.dtype} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+            if ti._array_overlap(x, out):
+                if not ti._same_logical_tensors(x, out):
+                    out = dpt.empty_like(out)
+                else:
+                    return out
+        else:
+            if order == "K":
+                out = _empty_like_orderK(x, x.dtype)
+            else:
+                out = dpt.empty_like(x, order=order)
+
+        _manager = SequentialOrderManager[exec_q]
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=out, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if not (orig_out is None or orig_out is out):
+            # Copy the out data from temporary buffer to original memory
+            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=out,
+                dst=orig_out,
+                sycl_queue=exec_q,
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
+            out = orig_out
+        return out
+    elif max is None:
+        return _clip_none(x, min, out, order, tei._maximum)
+    elif min is None:
+        return _clip_none(x, max, out, order, tei._minimum)
+    else:
+        q1, x_usm_type = x.sycl_queue, x.usm_type
+        q2, min_usm_type = _get_queue_usm_type(min)
+        q3, max_usm_type = _get_queue_usm_type(max)
+        if q2 is None and q3 is None:
+            exec_q = q1
+            res_usm_type = x_usm_type
+        elif q3 is None:
+            exec_q = dpt.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise dpt.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpt.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                )
+            )
+        elif q2 is None:
+            exec_q = dpt.get_execution_queue((q1, q3))
+            if exec_q is None:
+                raise dpt.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpt.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    max_usm_type,
+                )
+            )
+        else:
+            exec_q = dpt.get_execution_queue((q1, q2, q3))
+            if exec_q is None:
+                raise dpt.ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpt.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                    max_usm_type,
+                )
+            )
+        dpt.validate_usm_type(res_usm_type, allow_none=False)
+        x_shape = x.shape
+        min_shape = _get_shape(min)
+        max_shape = _get_shape(max)
+        if not all(
+            isinstance(s, (tuple, list))
+            for s in (
+                min_shape,
+                max_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = 
ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + if order == "K": + if ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + elif ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = 
_empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, clip_ev = ti._clip( + src=x, + min=buf1, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, clip_ev) + return out diff --git a/dpnp/tensor/_compute_follows_data.pyx b/dpnp/tensor/_compute_follows_data.pyx new file mode 100644 index 000000000000..70e6bdfaeb79 --- /dev/null +++ b/dpnp/tensor/_compute_follows_data.pyx @@ -0,0 +1,191 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +"""Compute-follows-data utilities for execution queue and USM type management. + +This module provides utilities to determine execution placement and USM allocation +types when combining arrays under the compute-follows-data paradigm. +""" + + +import dpctl +from dpctl._sycl_queue cimport SyclQueue + +__all__ = [ + "get_execution_queue", "get_coerced_usm_type", "ExecutionPlacementError" +] + + +class ExecutionPlacementError(Exception): + """Exception raised when execution placement target can not + be unambiguously determined from input arrays. 
+
+    Make sure that input arrays are associated with the same
+    :class:`dpctl.SyclQueue`,
+    or migrate data to the same :class:`dpctl.SyclQueue` using
+    :meth:`dpctl.tensor.usm_ndarray.to_device` method.
+    """
+    pass
+
+
+cdef bint queue_equiv(SyclQueue q1, SyclQueue q2):
+    """Queues are equivalent if ``q1 == q2``, that is they are copies
+    of the same underlying SYCL object and hence are the same."""
+    return q1.__eq__(q2)
+
+
+def get_execution_queue(qs, /):
+    """
+    Get execution queue from queues associated with input arrays.
+
+    Args:
+        qs (List[:class:`dpctl.SyclQueue`], Tuple[:class:`dpctl.SyclQueue`]):
+            a list or a tuple of :class:`dpctl.SyclQueue` objects
+            corresponding to arrays that are being combined.
+
+    Returns:
+        SyclQueue:
+            execution queue under compute follows data paradigm,
+            or ``None`` if queues are not equal.
+    """
+    if not isinstance(qs, (list, tuple)):
+        raise TypeError(
+            "Expected a list or a tuple, got {}".format(type(qs))
+        )
+    if len(qs) == 0:
+        return None
+    elif len(qs) == 1:
+        return qs[0] if isinstance(qs[0], dpctl.SyclQueue) else None
+    for q1, q2 in zip(qs[:-1], qs[1:]):
+        if not isinstance(q1, dpctl.SyclQueue):
+            return None
+        elif not isinstance(q2, dpctl.SyclQueue):
+            return None
+        elif not queue_equiv(q1, q2):
+            return None
+    return qs[0]
+
+
+def get_coerced_usm_type(usm_types, /):
+    """
+    Get USM type of the output array for a function combining
+    arrays of given USM types using compute-follows-data execution
+    model.
+
+    Args:
+        usm_types (List[str], Tuple[str]):
+            a list or a tuple of strings of ``.usm_type`` attributes
+            for input arrays
+
+    Returns:
+        str:
+            type of USM allocation for the output array(s).
+            ``None`` if any of the input strings are not recognized.
+    """
+    if not isinstance(usm_types, (list, tuple)):
+        raise TypeError(
+            "Expected a list or a tuple, got {}".format(type(usm_types))
+        )
+    if len(usm_types) == 0:
+        return None
+    _k = ["device", "shared", "host"]
+    _m = {k: i for i, k in enumerate(_k)}
+    res = len(_k)
+    for t in usm_types:
+        if not isinstance(t, str):
+            return None
+        if t not in _m:
+            return None
+        res = min(res, _m[t])
+    return _k[res]
+
+
+def _validate_usm_type_allow_none(usm_type):
+    "Validates usm_type argument"
+    if usm_type is not None:
+        if isinstance(usm_type, str):
+            if usm_type not in ["device", "shared", "host"]:
+                raise ValueError(
+                    f"Unrecognized value of usm_type={usm_type}, "
+                    "expected 'device', 'shared', 'host', or None."
+                )
+        else:
+            raise TypeError(
+                f"Expected usm_type to be a str or None, got {type(usm_type)}"
+            )
+
+
+def _validate_usm_type_disallow_none(usm_type):
+    "Validates usm_type argument"
+    if isinstance(usm_type, str):
+        if usm_type not in ["device", "shared", "host"]:
+            raise ValueError(
+                f"Unrecognized value of usm_type={usm_type}, "
+                "expected 'device', 'shared', or 'host'."
+            )
+    else:
+        raise TypeError(
+            f"Expected usm_type to be a str, got {type(usm_type)}"
+        )
+
+
+def validate_usm_type(usm_type, /, *, allow_none=True):
+    """ validate_usm_type(usm_type, allow_none=True)
+
+    Raises an exception if `usm_type` is invalid.
+
+    Args:
+        usm_type:
+            Specification for USM allocation type. Valid specifications
+            are:
+
+            * ``"device"``
+            * ``"shared"``
+            * ``"host"``
+
+            If ``allow_none`` keyword argument is set, a value of
+            ``None`` is also permitted.
+        allow_none (bool, optional):
+            Whether ``usm_type`` value of ``None`` is considered valid.
+            Default: ``True``.
+
+    Raises:
+        ValueError:
+            if ``usm_type`` is not a recognized string.
+ TypeError: + if ``usm_type`` is not a string, and ``usm_type`` is + not ``None`` provided ``allow_none`` is ``True``. + """ + if allow_none: + _validate_usm_type_allow_none(usm_type) + else: + _validate_usm_type_disallow_none(usm_type) diff --git a/dpnp/tensor/_constants.py b/dpnp/tensor/_constants.py new file mode 100644 index 000000000000..4c134bd9d375 --- /dev/null +++ b/dpnp/tensor/_constants.py @@ -0,0 +1,36 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np + +newaxis = None + +pi = np.pi +e = np.e +nan = np.nan +inf = np.inf diff --git a/dpnp/tensor/_copy_utils.py b/dpnp/tensor/_copy_utils.py new file mode 100644 index 000000000000..3978e7345b12 --- /dev/null +++ b/dpnp/tensor/_copy_utils.py @@ -0,0 +1,1160 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import builtins
+import operator
+from numbers import Integral
+
+import dpctl
+import dpctl.memory as dpm
+import numpy as np
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._data_types import _get_dtype
+from ._device import normalize_queue_device
+from ._numpy_helper import normalize_axis_index
+from ._type_utils import _dtype_supported_by_device_impl
+
+__doc__ = (
+    "Implementation module for copy- and cast- operations on "
+    ":class:`dpctl.tensor.usm_ndarray`."
+)
+
+int32_t_max = 1 + np.iinfo(np.int32).max
+
+
+def _copy_to_numpy(ary):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(ary)}")
+    if ary.size == 0:
+        # no data needs to be copied for zero sized array
+        return np.ndarray(ary.shape, dtype=ary.dtype)
+    nb = ary.usm_data.nbytes
+    q = ary.sycl_queue
+    hh = dpm.MemoryUSMHost(nb, queue=q)
+    h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype)
+    itsz = ary.itemsize
+    strides_bytes = tuple(si * itsz for si in ary.strides)
+    offset = ary._element_offset * itsz
+    # ensure that content of ary.usm_data is final
+    q.wait()
+    hh.copy_from_device(ary.usm_data)
+    return np.ndarray(
+        ary.shape,
+        dtype=ary.dtype,
+        buffer=h,
+        strides=strides_bytes,
+        offset=offset,
+    )
+
+
+def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None):
+    """Copies numpy array `np_ary` into a new usm_ndarray"""
+    # This may perform a copy to meet stated requirements
+    Xnp = np.require(np_ary, requirements=["A", "E"])
+    alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None)
+    dt = Xnp.dtype
+    if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False:
+        Xusm_dtype = (
+            dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64")
+        )
+    else:
+        Xusm_dtype = dt
+    Xusm = dpt.empty(
+        Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue
+    )
+    _copy_from_numpy_into(Xusm, Xnp)
+    return Xusm
+
+
+def _copy_from_numpy_into(dst, np_ary):
+    """Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray`"""
+    if not isinstance(np_ary, np.ndarray):
+        raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}")
+    if not isinstance(dst, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(dst)}")
+    if np_ary.flags["OWNDATA"]:
+        Xnp = np_ary
+    else:
+        # Determine base of input array
+        base = np_ary.base
+        while isinstance(base, np.ndarray):
+            base = base.base
+        if isinstance(base, dpm._memory._Memory):
+            # we must perform a copy, since subsequent
+            # _copy_numpy_ndarray_into_usm_ndarray is implemented using
+            # sycl::buffer, and using USM-pointers with sycl::buffer
+            # results in undefined behavior
+            Xnp = np_ary.copy()
+        else:
+            Xnp = np_ary
+    src_ary = np.broadcast_to(Xnp, dst.shape)
+    copy_q = dst.sycl_queue
+    if copy_q.sycl_device.has_aspect_fp64 is False:
+        src_ary_dt_c = src_ary.dtype.char
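+        # The device lacks fp64 support: downcast double-precision host
+        # data so the copy below targets a type the device supports
+        # (mirrors the allocation choice made in _copy_from_numpy above).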
+        if src_ary_dt_c == "d":
+            src_ary = src_ary.astype(np.float32)
+        elif src_ary_dt_c == "D":
+            src_ary = src_ary.astype(np.complex64)
+    _manager = SequentialOrderManager[copy_q]
+    dep_ev = _manager.submitted_events
+    # synchronizing call
+    ti._copy_numpy_ndarray_into_usm_ndarray(
+        src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev
+    )
+
+
+def _extract_impl(ary, ary_mask, axis=0):
+    """
+    Extract elements of ary by applying mask starting from slot
+    dimension axis
+    """
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
+        )
+    if isinstance(ary_mask, dpt.usm_ndarray):
+        dst_usm_type = dpt.get_coerced_usm_type(
+            (ary.usm_type, ary_mask.usm_type)
+        )
+        exec_q = dpt.get_execution_queue((ary.sycl_queue, ary_mask.sycl_queue))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "arrays have different associated queues. "
+                "Use `y.to_device(x.device)` to migrate."
+            )
+    elif isinstance(ary_mask, np.ndarray):
+        dst_usm_type = ary.usm_type
+        exec_q = ary.sycl_queue
+        ary_mask = dpt.asarray(
+            ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q
+        )
+    else:
+        raise TypeError(
+            "Expecting type dpnp.tensor.usm_ndarray or numpy.ndarray, got "
+            f"{type(ary_mask)}"
+        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter p is inconsistent with input array dimensions"
+        )
+    mask_nelems = ary_mask.size
+    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+    cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
+    exec_q = cumsum.sycl_queue
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    mask_count = ti.mask_positions(
+        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
+    )
+    dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    dst = dpt.empty(
+        dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device
+    )
+    if dst.size == 0:
+        return dst
+    hev, ev = ti._extract(
+        src=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        dst=dst,
+        sycl_queue=exec_q,
+        depends=dep_evs,
+    )
+    _manager.add_event_pair(hev, ev)
+    return dst
+
+
+def _get_indices_queue_usm_type(inds, queue, usm_type):
+    """
+    Utility for validating that indices are NumPy ndarrays or usm_ndarrays
+    of integral dtype, or Python integers. At least one must be an array.
+
+    The queue and usm type of each array index are combined with the
+    provided `queue` and `usm_type`, and the resulting common execution
+    queue and coerced usm type are returned.
+ """ + queues = [queue] + usm_types = [usm_type] + any_array = False + for ind in inds: + if isinstance(ind, (np.ndarray, dpt.usm_ndarray)): + any_array = True + if ind.dtype.kind not in "ui": + raise IndexError( + "arrays used as indices must be of integer (or boolean) " + "type" + ) + if isinstance(ind, dpt.usm_ndarray): + queues.append(ind.sycl_queue) + usm_types.append(ind.usm_type) + elif not isinstance(ind, Integral): + raise TypeError( + "all elements of `ind` expected to be usm_ndarrays, " + f"NumPy arrays, or integers, found {type(ind)}" + ) + if not any_array: + raise TypeError( + "at least one element of `inds` expected to be an array" + ) + usm_type = dpt.get_coerced_usm_type(usm_types) + q = dpt.get_execution_queue(queues) + return q, usm_type + + +def _nonzero_impl(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}" + ) + exec_q = ary.sycl_queue + usm_type = ary.usm_type + mask_nelems = ary.size + cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 + cumsum = dpt.empty( + mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" + ) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + indexes_dt = ti.default_device_index_type(exec_q.sycl_device) + indexes = dpt.empty( + (ary.ndim, mask_count), + dtype=indexes_dt, + usm_type=usm_type, + sycl_queue=exec_q, + order="C", + ) + hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + res = tuple(indexes[i, :] for i in range(ary.ndim)) + _manager.add_event_pair(hev, nz_ev) + return res + + +def _prepare_indices_arrays(inds, q, usm_type): + """ + Utility taking a mix of usm_ndarray and possibly Python int scalar indices, + a queue (assumed to be common to arrays in inds), and a usm type. + + Python scalar integers are promoted to arrays on the provided queue and + with the provided usm type. All arrays are then promoted to a common + integral type (if possible) before being broadcast to a common shape. + """ + # scalar integers -> arrays + inds = tuple( + map( + lambda ind: ( + ind + if isinstance(ind, dpt.usm_ndarray) + else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q) + ), + inds, + ) + ) + + # promote to a common integral type if possible + ind_dt = dpt.result_type(*inds) + if ind_dt.kind not in "ui": + raise ValueError( + "cannot safely promote indices to an integer data type" + ) + inds = tuple( + map( + lambda ind: ( + ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt) + ), + inds, + ) + ) + + # broadcast + inds = dpt.broadcast_arrays(*inds) + + return inds + + +def _place_impl(ary, ary_mask, vals, axis=0): + """ + Extract elements of ary by applying mask starting from slot + dimension axis. + """ + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}" + ) + if isinstance(ary_mask, dpt.usm_ndarray): + exec_q = dpt.get_execution_queue( + ( + ary.sycl_queue, + ary_mask.sycl_queue, + ) + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + ary.usm_type, + ary_mask.usm_type, + ) + ) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "arrays have different associated queues. " + "Use `y.to_device(x.device)` to migrate." 
+    elif isinstance(ary_mask, np.ndarray):
+        exec_q = ary.sycl_queue
+        coerced_usm_type = ary.usm_type
+        ary_mask = dpt.asarray(
+            ary_mask, usm_type=coerced_usm_type, sycl_queue=exec_q
+        )
+    else:
+        raise TypeError(
+            "Expecting type dpnp.tensor.usm_ndarray or numpy.ndarray, got "
+            f"{type(ary_mask)}"
+        )
+    if exec_q is not None:
+        if not isinstance(vals, dpt.usm_ndarray):
+            vals = dpt.asarray(
+                vals,
+                dtype=ary.dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        else:
+            exec_q = dpt.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpt.get_coerced_usm_type(
+                (
+                    coerced_usm_type,
+                    vals.usm_type,
+                )
+            )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "arrays have different associated queues. "
+            "Use `Y.to_device(X.device)` to migrate."
+        )
+    ary_nd = ary.ndim
+    pp = normalize_axis_index(operator.index(axis), ary_nd)
+    mask_nd = ary_mask.ndim
+    if pp < 0 or pp + mask_nd > ary_nd:
+        raise ValueError(
+            "Parameter p is inconsistent with input array dimensions"
+        )
+    mask_nelems = ary_mask.size
+    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+    cumsum = dpt.empty(
+        mask_nelems,
+        dtype=cumsum_dt,
+        usm_type=coerced_usm_type,
+        device=ary_mask.device,
+    )
+    exec_q = cumsum.sycl_queue
+    _manager = SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    mask_count = ti.mask_positions(
+        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev
+    )
+    expected_vals_shape = (
+        ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+    )
+    if vals.dtype == ary.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    if mask_nelems == 0:
+        return
+    dep_ev = _manager.submitted_events
+    hev, pl_ev = ti._place(
+        dst=ary,
+        cumsum=cumsum,
+        axis_start=pp,
+        axis_end=pp + mask_nd,
+        rhs=rhs,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, pl_ev)
+    return
+
+
+def _put_multi_index(ary, inds, p, vals, mode=0):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
+        )
+    ary_nd = ary.ndim
+    p = normalize_axis_index(operator.index(p), ary_nd)
+    mode = operator.index(mode)
+    if mode not in [0, 1]:
+        raise ValueError(
+            "Invalid value for mode keyword, only 0 or 1 is supported"
+        )
+    if not isinstance(inds, (list, tuple)):
+        inds = (inds,)
+
+    exec_q, coerced_usm_type = _get_indices_queue_usm_type(
+        inds, ary.sycl_queue, ary.usm_type
+    )
+
+    if exec_q is not None:
+        if not isinstance(vals, dpt.usm_ndarray):
+            vals = dpt.asarray(
+                vals,
+                dtype=ary.dtype,
+                usm_type=coerced_usm_type,
+                sycl_queue=exec_q,
+            )
+        else:
+            exec_q = dpt.get_execution_queue((exec_q, vals.sycl_queue))
+            coerced_usm_type = dpt.get_coerced_usm_type(
+                (
+                    coerced_usm_type,
+                    vals.usm_type,
+                )
+            )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Can not automatically determine where to allocate the "
+            "result or perform the execution. "
+            "Use `usm_ndarray.to_device` method to migrate data to "
+            "be associated with the same queue."
+        )
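+
+    # Normalize the indices: promote Python scalars to arrays, unify the
+    # integral dtype, and broadcast to a common shape.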
+    inds = _prepare_indices_arrays(inds, exec_q, coerced_usm_type)
+
+    ind0 = inds[0]
+    ary_sh = ary.shape
+    p_end = p + len(inds)
+    if 0 in ary_sh[p:p_end] and ind0.size != 0:
+        raise IndexError(
+            "cannot put into non-empty indices along an empty axis"
+        )
+    expected_vals_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
+    if vals.dtype == ary.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, ary.dtype)
+    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+    _manager = SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    hev, put_ev = ti._put(
+        dst=ary,
+        ind=inds,
+        val=rhs,
+        axis_start=p,
+        mode=mode,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, put_ev)
+    return
+
+
+def _take_multi_index(ary, inds, p, mode=0):
+    if not isinstance(ary, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expecting type dpnp.tensor.usm_ndarray, got {type(ary)}"
+        )
+    ary_nd = ary.ndim
+    p = normalize_axis_index(operator.index(p), ary_nd)
+    mode = operator.index(mode)
+    if mode not in [0, 1]:
+        raise ValueError(
+            "Invalid value for mode keyword, only 0 or 1 is supported"
+        )
+    if not isinstance(inds, (list, tuple)):
+        inds = (inds,)
+
+    exec_q, res_usm_type = _get_indices_queue_usm_type(
+        inds, ary.sycl_queue, ary.usm_type
+    )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Can not automatically determine where to allocate the "
+            "result or perform the execution. "
+            "Use `usm_ndarray.to_device` method to migrate data to "
+            "be associated with the same queue."
+        )
+
+    inds = _prepare_indices_arrays(inds, exec_q, res_usm_type)
+
+    ind0 = inds[0]
+    ary_sh = ary.shape
+    p_end = p + len(inds)
+    if 0 in ary_sh[p:p_end] and ind0.size != 0:
+        raise IndexError("cannot take non-empty indices from an empty axis")
+    res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
+    res = dpt.empty(
+        res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+    _manager = SequentialOrderManager[exec_q]
+    dep_ev = _manager.submitted_events
+    hev, take_ev = ti._take(
+        src=ary,
+        ind=inds,
+        dst=res,
+        axis_start=p,
+        mode=mode,
+        sycl_queue=exec_q,
+        depends=dep_ev,
+    )
+    _manager.add_event_pair(hev, take_ev)
+    return res
+
+
+def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None):
+    """
+    from_numpy(arg, device=None, usm_type="device", sycl_queue=None)
+
+    Creates :class:`dpctl.tensor.usm_ndarray` from instance of
+    :class:`numpy.ndarray`.
+
+    Args:
+        arg:
+            Input convertible to :class:`numpy.ndarray`
+        device (object): array API specification of device where the
+            output array is created. Device can be specified by
+            a filter selector string, an instance of
+            :class:`dpctl.SyclDevice`, an instance of
+            :class:`dpctl.SyclQueue`, or an instance of
+            :class:`dpctl.tensor.Device`. If the value is ``None``,
+            returned array is created on the default-selected device.
+            Default: ``None``
+        usm_type (str): The requested USM allocation type for the
+            output array. Recognized values are ``"device"``,
+            ``"shared"``, or ``"host"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            A SYCL queue that determines output array allocation device
+            as well as execution placement of data movement operations.
+            The ``device`` and ``sycl_queue`` arguments
+            are equivalent. Only one of them should be specified. If both
+            are provided, they must be consistent and result in using the
+            same execution queue. Default: ``None``
+
+    The returned array has the same shape, and the same data type kind.
+    If the device does not support the data type of the input array, the
+    closest supported data type of the same kind may be returned, e.g.
+    an input array of type ``float16`` may be upcast to ``float32`` if the
+    target device does not support 16-bit floating point type.
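+
+    Example:
+        A minimal sketch, assuming `from_numpy` is re-exported from the
+        `dpnp.tensor` namespace and a default-selected device is
+        available:
+
+        >>> import numpy as np
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.from_numpy(np.arange(3))
+        >>> x.shape, x.usm_type
+        ((3,), 'device')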
+ If the device does not support the data type of the input array, the
+ closest supported data type of the same kind may be returned, e.g.
+ an input array of type ``float16`` may be upcast to ``float32`` if the
+ target device does not support 16-bit floating point type.
+ """
+ q = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+ return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q)
+
+
+def to_numpy(usm_ary, /):
+ """
+ to_numpy(usm_ary)
+
+ Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
+ into :class:`numpy.ndarray` instance of the same shape and same data type.
+
+ Args:
+ usm_ary (usm_ndarray):
+ Input array
+ Returns:
+ :class:`numpy.ndarray`:
+ An instance of :class:`numpy.ndarray` populated with content of
+ ``usm_ary``
+ """
+ return _copy_to_numpy(usm_ary)
+
+
+def asnumpy(usm_ary):
+ """
+ asnumpy(usm_ary)
+
+ Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
+ into :class:`numpy.ndarray` instance of the same shape and same data
+ type.
+
+ Args:
+ usm_ary (usm_ndarray):
+ Input array
+ Returns:
+ :class:`numpy.ndarray`:
+ An instance of :class:`numpy.ndarray` populated with content
+ of ``usm_ary``
+ """
+ return _copy_to_numpy(usm_ary)
+
+
+class Dummy:
+ """Helper class with specified ``__sycl_usm_array_interface__`` attribute"""
+
+ def __init__(self, iface):
+ self.__sycl_usm_array_interface__ = iface
+
+
+def _copy_overlapping(dst, src):
+ """Assumes src and dst have the same shape."""
+ q = normalize_queue_device(sycl_queue=dst.sycl_queue)
+ tmp = dpt.usm_ndarray(
+ src.shape,
+ dtype=src.dtype,
+ buffer="device",
+ order="C",
+ buffer_ctor_kwargs={"queue": q},
+ )
+ _manager = SequentialOrderManager[q]
+ dep_evs = _manager.submitted_events
+ hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=src, dst=tmp, sycl_queue=q, depends=dep_evs
+ )
+ _manager.add_event_pair(hcp1, cp1)
+ hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=tmp, dst=dst, sycl_queue=q, depends=[cp1]
+ )
+ _manager.add_event_pair(hcp2, cp2)
+
+
+def _copy_same_shape(dst, src):
+ """Assumes src and dst have the same shape."""
+ # check that memory regions do not overlap
+ if ti._array_overlap(dst, src):
+ if src._pointer == dst._pointer and (
+ src is dst
+ or (src.strides == dst.strides and src.dtype == dst.dtype)
+ ):
+ return
+ _copy_overlapping(src=src, dst=dst)
+ return
+
+ copy_q = dst.sycl_queue
+ _manager = SequentialOrderManager[copy_q]
+ dep_evs = _manager.submitted_events
+ hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs
+ )
+ _manager.add_event_pair(hev, cpy_ev)
+
+
+if hasattr(np, "broadcast_shapes"):
+
+ def _broadcast_shapes(sh1, sh2):
+ return np.broadcast_shapes(sh1, sh2)
+
+else:
+
+ def _broadcast_shapes(sh1, sh2):
+ # use arrays with zero strides, whose memory footprint
+ # is independent of the number of array elements
+ return np.broadcast(
+ np.empty(sh1, dtype=[]),
+ np.empty(sh2, dtype=[]),
+ ).shape
+
+
+def _broadcast_strides(X_shape, X_strides, res_ndim):
+ """
+ Broadcasts strides to match the given dimensions;
+ returns strides as a tuple.
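# A minimal round-trip sketch for from_numpy()/to_numpy()/asnumpy() above,
# using the public dpctl.tensor API that these vendored functions mirror.
# Assumes a default-selected SYCL device is available.
import numpy as np
import dpctl.tensor as dpt

host = np.arange(6, dtype=np.float32).reshape(2, 3)
dev_arr = dpt.from_numpy(host)     # copy host data into a USM allocation
back = dpt.to_numpy(dev_arr)       # copy device data into a new ndarray
assert np.array_equal(host, back)
assert dpt.asnumpy(dev_arr).dtype == host.dtype  # asnumpy() is an alias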
+ """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _copy_from_usm_ndarray_to_usm_ndarray(dst, src): + if any( + not isinstance(arg, dpt.usm_ndarray) + for arg in ( + dst, + src, + ) + ): + raise TypeError( + "Both types are expected to be dpnp.tensor.usm_ndarray, " + f"got {type(dst)} and {type(src)}." + ) + + if dst.ndim == src.ndim and dst.shape == src.shape: + _copy_same_shape(dst, src) + return + + try: + common_shape = _broadcast_shapes(dst.shape, src.shape) + except ValueError as exc: + raise ValueError("Shapes of two arrays are not compatible") from exc + + if dst.size < src.size and dst.size < np.prod(common_shape): + raise ValueError("Destination is smaller ") + + if len(common_shape) > dst.ndim: + ones_count = len(common_shape) - dst.ndim + for k in range(ones_count): + if common_shape[k] != 1: + raise ValueError + common_shape = common_shape[ones_count:] + + if src.ndim < len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + elif src.ndim == len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + else: + # since broadcasting succeeded, src.ndim is greater because of + # leading sequence of ones, so we trim it + n = len(common_shape) + new_src_strides = _broadcast_strides( + src.shape[-n:], src.strides[-n:], n + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src.usm_data, + strides=new_src_strides, + offset=src._element_offset, + ) + + _copy_same_shape(dst, src_same_shape) + + +def _make_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty array with shape and strides like `x`, with dtype `dt`, + USM type `usm_type`, on device `dev`. + """ + st = list(x.strides) + perm = sorted( + range(x.ndim), + key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0, + reverse=True, + ) + inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) + sh = x.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if min(st) < 0: + st_sorted = [st[i] for i in perm] + sl = tuple( + ( + slice(None, None, -1) + if st_sorted[i] < 0 + else slice(None, None, None) + ) + for i in range(x.ndim) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_orderK(x, dt, usm_type=None, dev=None): + """ + Returns empty array like `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(x)}") + if usm_type is None: + usm_type = x.usm_type + if dev is None: + dev = x.device + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty usm_ndarray like NumPy array `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. + """ + if not isinstance(x, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(x)}") + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + nd1 = X1.ndim + nd2 = X2.ndim + if nd1 > nd2 and X1.shape == res_shape: + return _empty_like_orderK(X1, dt, usm_type, dev) + elif nd1 < nd2 and X2.shape == res_shape: + return _empty_like_orderK(X2, dt, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + if fl1["C"] or fl2["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + max_ndim = max(nd1, nd2) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if (st1_sorted[i] < 0 and st2_sorted[i] < 0) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + if not isinstance(X3, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X3)}") + nd1 = X1.ndim + nd2 = X2.ndim + nd3 = X3.ndim + if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3: + return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev) + elif ( + X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1 + ): + return _empty_like_pair_orderK(X2, X3, 
dt, res_shape, usm_type, dev) + elif ( + X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2 + ): + return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + fl3 = X3.flags + if fl1["C"] or fl2["C"] or fl3["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"] and fl3["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + st3 = list(X3.strides) + max_ndim = max(nd1, nd2, nd3) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + st3 += [0] * (max_ndim - len(st3)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + sh3 = list(X3.shape) + [0] * (max_ndim - nd3) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + builtins.abs(st3[d]) if sh3[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + st3_sorted = [st3[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if ( + st1_sorted[i] < 0 + and st2_sorted[i] < 0 + and st3_sorted[i] < 0 + ) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def copy(usm_ary, /, *, order="K"): + """copy(ary, order="K") + + Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`. + + Args: + ary (usm_ndarray): + Input array + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + Controls the memory layout of the output array + Returns: + usm_ndarray: + A copy of the input array. + + Memory layout of the copy is controlled by ``order`` keyword, + following NumPy's conventions. The ``order`` keywords can be + one of the following: + + .. list-table:: + + * - ``"C"`` + - C-contiguous memory layout + * - ``"F"`` + - Fortran-contiguous memory layout + * - ``"A"`` + - Fortran-contiguous if the input array is also Fortran-contiguous, + otherwise C-contiguous + * - ``"K"`` + - match the layout of ``usm_ary`` as closely as possible. + + """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, usm_ary.dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=usm_ary.dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_same_shape(R, usm_ary) + return R + + +def astype( + usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None +): + """astype(array, new_dtype, order="K", casting="unsafe", \ + copy=True, device=None) + + Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a + specified type. + + Args: + array (usm_ndarray): + An input array. + new_dtype (dtype): + The data type of the resulting array. If `None`, gives default + floating point type supported by device where the resulting array + will be located. + order ({"C", "F", "A", "K"}, optional): + Controls memory layout of the resulting array if a copy + is returned. + casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional): + Controls what kind of data casting may occur. Please see + :meth:`numpy.ndarray.astype` for description of casting modes. + copy (bool, optional): + By default, `astype` always returns a newly allocated array. + If this keyword is set to `False`, a view of the input array + may be returned when possible. + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is `None`, + returned array is created on the same device as `array`. + Default: `None`. + + Returns: + usm_ndarray: + An array with requested data type. + + A view can be returned, if possible, when `copy=False` is used. + """ + if not isinstance(usm_ary, dpt.usm_ndarray): + return TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + ary_dtype = usm_ary.dtype + if device is not None: + if not isinstance(device, dpctl.SyclQueue): + if isinstance(device, dpt.Device): + device = device.sycl_queue + else: + device = dpt.Device.create_device(device).sycl_queue + d = device.sycl_device + target_dtype = _get_dtype(newdtype, device) + if not _dtype_supported_by_device_impl( + target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 + ): + raise ValueError( + f"Requested dtype '{target_dtype}' is not supported by the " + "target device" + ) + usm_ary = usm_ary.to_device(device) + else: + target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) + + if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + raise TypeError( + f"Can not cast from {ary_dtype} to {newdtype} " + f"according to rule {casting}." + ) + c_contig = usm_ary.flags.c_contiguous + f_contig = usm_ary.flags.f_contiguous + needs_copy = copy or not ary_dtype == target_dtype + if not needs_copy and (order != "K"): + # ensure that order="F" for C-contig input triggers copy, + # and order="C" for F-contig input triggers copy too. 
+ # 1D arrays which are both C- and F- contig should not + # force copying for neither order="F", nor order="C", see gh-1926 + needs_copy = ( + c_contig and not f_contig and order not in ["A", "C"] + ) or (not c_contig and f_contig and order not in ["A", "F"]) + if not needs_copy: + return usm_ary + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. " + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, target_dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=target_dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) + return R diff --git a/dpnp/tensor/_ctors.py b/dpnp/tensor/_ctors.py new file mode 100644 index 000000000000..b6e28afdc9e7 --- /dev/null +++ b/dpnp/tensor/_ctors.py @@ -0,0 +1,1972 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
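# astype() sketch, assuming a default SYCL device: casting="safe" rejects
# lossy conversions, and copy=False may return the input unchanged.
import dpctl.tensor as dpt

x = dpt.arange(4, dtype="i4")
y = dpt.astype(x, "f4")              # new allocation with requested dtype
same = dpt.astype(x, "i4", copy=False)
assert same is x                     # no copy needed: dtype and order match
try:
    dpt.astype(y, "i4", casting="safe")
except TypeError:
    pass                             # float -> int is not a safe cast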
+# ***************************************************************************** + +import operator +from numbers import Number + +import dpctl +import dpctl.memory as dpm +import numpy as np +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from ._copy_utils import ( + _empty_like_orderK, + _from_numpy_empty_like_orderK, +) +from ._data_types import _get_dtype +from ._device import normalize_queue_device +from ._usmarray import _is_object_with_buffer_protocol + +__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" + +_empty_tuple = () +_host_set = frozenset([None]) + + +def _array_info_dispatch(obj): + if isinstance(obj, dpt.usm_ndarray): + return obj.shape, obj.dtype, frozenset([obj.sycl_queue]) + if isinstance(obj, np.ndarray): + return obj.shape, obj.dtype, _host_set + if isinstance(obj, range): + return (len(obj),), int, _host_set + if isinstance(obj, bool): + return _empty_tuple, bool, _host_set + if isinstance(obj, float): + return _empty_tuple, float, _host_set + if isinstance(obj, int): + return _empty_tuple, int, _host_set + if isinstance(obj, complex): + return _empty_tuple, complex, _host_set + if isinstance( + obj, + ( + list, + tuple, + ), + ): + return _array_info_sequence(obj) + if _is_object_with_buffer_protocol(obj): + np_obj = np.array(obj) + return np_obj.shape, np_obj.dtype, _host_set + if hasattr(obj, "__usm_ndarray__"): + usm_ar = obj.__usm_ndarray__ + if isinstance(usm_ar, dpt.usm_ndarray): + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + if hasattr(obj, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(obj) + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + + +def _array_info_sequence(li): + if not isinstance(li, (list, tuple, range)): + raise TypeError(f"Expected list, tuple, or range, got {type(li)}") + n = len(li) + dim = None + dt = None + device = frozenset() + for el in li: + el_dim, el_dt, el_dev = _array_info_dispatch(el) + if dim is None: + dim = el_dim + dt = np.promote_types(el_dt, el_dt) + device = device.union(el_dev) + elif el_dim == dim: + dt = np.promote_types(dt, el_dt) + device = device.union(el_dev) + else: + raise ValueError(f"Inconsistent dimensions, {dim} and {el_dim}") + if dim is None: + dim = () + dt = float + device = _host_set + return (n,) + dim, dt, device + + +def _asarray_from_numpy_ndarray( + ary, dtype=None, usm_type=None, sycl_queue=None, order="K" +): + if not isinstance(ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(ary)}") + if usm_type is None: + usm_type = "device" + copy_q = normalize_queue_device(sycl_queue=None, device=sycl_queue) + if ary.dtype.char not in "?bBhHiIlLqQefdFD": + raise TypeError( + f"Numpy array of data type {ary.dtype} is not supported. " + "Please convert the input to an array with numeric data type." 
+ ) + if dtype is None: + # deduce device-representable output data type + dtype = _map_to_device_dtype(ary.dtype, copy_q) + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + f_contig = ary.flags["F"] + c_contig = ary.flags["C"] + fc_contig = f_contig or c_contig + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + # new USM allocation + res = _from_numpy_empty_like_orderK(ary, dtype, usm_type, copy_q) + else: + res = dpt.usm_ndarray( + ary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + res[...] = ary + return res + + +def _asarray_from_seq( + seq_obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=None, + usm_type=None, + order="C", +): + """`seq_obj` is a sequence""" + if usm_type is None: + usm_types_in_seq = [] + _usm_types_walker(seq_obj, usm_types_in_seq) + usm_type = dpt.get_coerced_usm_type(usm_types_in_seq) + dpt.validate_usm_type(usm_type) + if dtype is None: + dtype = _map_to_device_dtype(seq_dt, alloc_q) + else: + _mapped_dt = _map_to_device_dtype(dtype, alloc_q) + if _mapped_dt != dtype: + raise ValueError( + f"Device {alloc_q.sycl_device} " + f"does not support {dtype} natively." + ) + dtype = _mapped_dt + if order in "KA": + order = "C" + if isinstance(exec_q, dpctl.SyclQueue): + res = dpt.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _manager = SequentialOrderManager[exec_q] + _device_copy_walker(seq_obj, res, _manager) + return res + else: + res = dpt.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _copy_through_host_walker(seq_obj, res) + return res + + +def _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=None, + usm_type=None, + sycl_queue=None, + order="C", +): + if sycl_queue is None: + exec_q = seq_dev + alloc_q = seq_dev + else: + exec_q = dpt.get_execution_queue( + ( + sycl_queue, + seq_dev, + ) + ) + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + + +def _asarray_from_usm_ndarray( + usm_ndary, + dtype=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + if not isinstance(usm_ndary, dpt.usm_ndarray): + raise TypeError( + f"Expected dpnp.tensor.usm_ndarray, got {type(usm_ndary)}" + ) + if usm_type is None: + usm_type = usm_ndary.usm_type + if sycl_queue is not None: + exec_q = dpt.get_execution_queue([usm_ndary.sycl_queue, sycl_queue]) + copy_q = normalize_queue_device(sycl_queue=sycl_queue, device=exec_q) + else: + copy_q = usm_ndary.sycl_queue + if dtype is None: + dtype = _map_to_device_dtype(usm_ndary.dtype, copy_q) + # Conditions for zero copy: + can_zero_copy = copy is not True + # dtype is unchanged + can_zero_copy = can_zero_copy and dtype == usm_ndary.dtype + # USM allocation type is unchanged + can_zero_copy = can_zero_copy and usm_type == usm_ndary.usm_type + # sycl_queue is unchanged + can_zero_copy = can_zero_copy and copy_q is usm_ndary.sycl_queue + # order is unchanged + c_contig = usm_ndary.flags.c_contiguous + f_contig = usm_ndary.flags.f_contiguous + fc_contig = usm_ndary.flags.forc + if can_zero_copy: + if order == "C" and c_contig: + pass + elif order == "F" and f_contig: + pass + elif order == "A" and fc_contig: + pass + elif order == "K": + pass + else: + can_zero_copy = False + if copy is False and 
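# Zero-copy sketch for _asarray_from_usm_ndarray() above, via the public
# dpctl.tensor API (assumes a default SYCL device): asarray() returns the
# input usm_ndarray unchanged when dtype, usm_type, queue, and order are
# all compatible.
import dpctl.tensor as dpt

x = dpt.ones(5, dtype="f4")
y = dpt.asarray(x)            # all zero-copy conditions hold
assert y is x
z = dpt.asarray(x, copy=True)
assert z is not x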
can_zero_copy is False: + raise ValueError("asarray(..., copy=False) is not possible") + if can_zero_copy: + return usm_ndary + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = _empty_like_orderK(usm_ndary, dtype, usm_type, copy_q) + else: + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = dpt.usm_ndarray( + usm_ndary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + eq = dpt.get_execution_queue([usm_ndary.sycl_queue, copy_q]) + if eq is not None: + _manager = SequentialOrderManager[eq] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + tmp = dpt.asnumpy(usm_ndary) + res[...] = tmp + return res + + +def _cast_fill_val(fill_val, dt): + """ + Casts the Python scalar `fill_val` to another Python type coercible to the + requested data type `dt`, if necessary. + """ + val_type = type(fill_val) + if val_type in [float, complex] and np.issubdtype(dt, np.integer): + return int(fill_val.real) + elif val_type is complex and np.issubdtype(dt, np.floating): + return fill_val.real + elif val_type is int and np.issubdtype(dt, np.integer): + return _to_scalar(fill_val, dt) + else: + return fill_val + + +def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False): + """Deduce arange type from sequence spec""" + nd, seq_dt, d = _array_info_sequence(args) + if d != _host_set or nd != (len(args),): + raise ValueError(err_msg) + dt = _get_dtype(dt, sycl_queue, ref_type=seq_dt) + if np.issubdtype(dt, np.integer): + return tuple(int(v) for v in args), dt + if np.issubdtype(dt, np.floating): + return tuple(float(v) for v in args), dt + if np.issubdtype(dt, np.complexfloating): + return tuple(complex(v) for v in args), dt + if allow_bool and dt.char == "?": + return tuple(bool(v) for v in args), dt + raise ValueError(f"Data type {dt} is not supported") + + +def _copy_through_host_walker(seq_o, usm_res): + if isinstance(seq_o, dpt.usm_ndarray): + if ( + dpt.get_execution_queue( + ( + usm_res.sycl_queue, + seq_o.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt.asnumpy(seq_o).copy() + return + else: + usm_res[...] = seq_o + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _copy_through_host_walker(usm_arr, usm_res) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + if ( + dpt.get_execution_queue( + ( + usm_res.sycl_queue, + usm_ar.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt.asnumpy(usm_ar).copy() + else: + usm_res[...] = usm_ar + return + if _is_object_with_buffer_protocol(seq_o): + np_ar = np.asarray(seq_o) + usm_res[...] = np_ar + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _copy_through_host_walker(el, usm_res[i]) + return + usm_res[...] 
= np.asarray(seq_o) + + +def _device_copy_walker(seq_o, res, _manager): + if isinstance(seq_o, dpt.usm_ndarray): + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=seq_o, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _device_copy_walker(usm_arr, res, _manager) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _device_copy_walker(el, res[i], _manager) + return + raise TypeError + + +def _ensure_native_dtype_device_support(dtype, dev) -> None: + """Check that dtype is natively supported by device. + + Arg: + dtype: + Elemental data-type + dev (:class:`dpctl.SyclDevice`): + The device about which the query is being made. + Returns: + None + Raise: + ValueError: + if device does not natively support this `dtype`. + """ + if dtype in [dpt.float64, dpt.complex128] and not dev.has_aspect_fp64: + raise ValueError( + f"Device {dev.name} does not provide native support " + "for double-precision floating point type." + ) + if ( + dtype + in [ + dpt.float16, + ] + and not dev.has_aspect_fp16 + ): + raise ValueError( + f"Device {dev.name} does not provide native support " + "for half-precision floating point type." + ) + + +def _get_arange_length(start, stop, step): + """Compute length of arange sequence""" + span = stop - start + if hasattr(step, "__float__") and hasattr(span, "__float__"): + return _round_for_arange(span / step) + tmp = span / step + if hasattr(tmp, "__complex__"): + tmp = complex(tmp) + tmp = tmp.real + else: + tmp = float(tmp) + return _round_for_arange(tmp) + + +def _map_to_device_dtype(dt, q): + dtc = dt.char + if dtc == "?" or np.issubdtype(dt, np.integer): + return dt + d = q.sycl_device + if np.issubdtype(dt, np.floating): + if dtc == "f": + return dt + if dtc == "d" and d.has_aspect_fp64: + return dt + if dtc == "e" and d.has_aspect_fp16: + return dt + return dpt.dtype("f4") + if np.issubdtype(dt, np.complexfloating): + if dtc == "F": + return dt + if dtc == "D" and d.has_aspect_fp64: + return dt + return dpt.dtype("c8") + raise RuntimeError(f"Unrecognized data type '{dt}' encountered.") + + +def _normalize_order(order, arr): + """ + Utility function for processing the `order` keyword of array-like + constructors, which support `"K"` and `"A"` orders. + """ + arr_flags = arr.flags + f_contig = arr_flags["F"] + c_contig = arr_flags["C"] + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and (f_contig or c_contig): + order = "C" if c_contig else "F" + return order + + +def _round_for_arange(tmp): + k = int(tmp) + if k >= 0 and float(k) < tmp: + tmp = tmp + 1 + return tmp + + +def _to_scalar(obj, sc_ty): + """A way to convert object to NumPy scalar type. + Raises OverflowError if obj can not be represented + using the requested scalar type. 
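# A sketch of the dtype fallback implemented by _map_to_device_dtype() above:
# float64/complex128 survive only on devices with the fp64 aspect, otherwise
# the type falls back to float32/complex64. Assumes a default SYCL device.
import dpctl
import dpctl.tensor as dpt

dev = dpctl.select_default_device()
x = dpt.asarray([1.0, 2.0])   # Python floats carry float64 semantics
expected = dpt.float64 if dev.has_aspect_fp64 else dpt.float32
assert x.dtype == expected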
+ """ + zd_arr = np.asarray(obj, dtype=sc_ty) + return zd_arr[()] + + +def _usm_ndarray_from_suai(obj): + sua_iface = obj.__sycl_usm_array_interface__ + membuf = dpm.as_usm_memory(obj) + ary = dpt.usm_ndarray( + sua_iface["shape"], + dtype=sua_iface["typestr"], + buffer=membuf, + strides=sua_iface.get("strides", None), + ) + _data_field = sua_iface["data"] + if isinstance(_data_field, tuple) and len(_data_field) > 1: + ro_field = _data_field[1] + else: + ro_field = False + if ro_field: + ary.flags["W"] = False + return ary + + +def _usm_types_walker(o, usm_types_list): + if isinstance(o, dpt.usm_ndarray): + usm_types_list.append(o.usm_type) + return + if hasattr(o, "__usm_ndarray__"): + usm_arr = o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + usm_types_list.append(usm_arr.usm_type) + return + if hasattr(o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(o) + usm_types_list.append(usm_ar.usm_type) + return + if _is_object_with_buffer_protocol(o): + return + if isinstance(o, (int, bool, float, complex)): + return + if isinstance(o, (list, tuple, range)): + for el in o: + _usm_types_walker(el, usm_types_list) + return + raise TypeError + + +def arange( + start, + /, + stop=None, + step=1, + *, + dtype=None, + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns evenly spaced values within the half-open interval [start, stop) + as a one-dimensional array. + + Args: + start: + Starting point of the interval + stop: + Ending point of the interval. Default: ``None`` + step: Increment of the returned sequence. Default: ``1`` + dtype: Output array data type. Default: ``None`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array populated with evenly spaced values. 
+ """ + if stop is None: + stop = start + start = 0 + if step is None: + step = 1 + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + is_bool = False + if dtype: + is_bool = (dtype is bool) or (dpt.dtype(dtype) == dpt.bool) + _, dt = _coerce_and_infer_dt( + start, + stop, + step, + dt=dpt.int8 if is_bool else dtype, + sycl_queue=sycl_queue, + err_msg="start, stop, and step must be Python scalars", + allow_bool=False, + ) + try: + tmp = _get_arange_length(start, stop, step) + sh = max(int(tmp), 0) + except TypeError: + sh = 0 + if is_bool and sh > 2: + raise ValueError("no fill-function for boolean data type") + res = dpt.usm_ndarray( + (sh,), + dtype=dt, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + sc_ty = dt.type + _first = _to_scalar(start, sc_ty) + if sh > 1: + _second = _to_scalar(start + step, sc_ty) + if dt in [dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64]: + int64_ty = dpt.int64.type + _step = int64_ty(_second) - int64_ty(_first) + else: + _step = _second - _first + _step = sc_ty(_step) + else: + _step = sc_ty(1) + _start = _first + _manager = SequentialOrderManager[sycl_queue] + # populating newly allocated array, no task dependencies + hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue) + _manager.add_event_pair(hev, lin_ev) + if is_bool: + res_out = dpt.usm_ndarray( + (sh,), + dtype=dpt.bool, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=res, dst=res_out, sycl_queue=sycl_queue, depends=[lin_ev] + ) + _manager.add_event_pair(hev_cpy, cpy_ev) + return res_out + return res + + +def asarray( + obj, + /, + *, + dtype=None, + device=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + """ + Converts input object to :class:`dpctl.tensor.usm_ndarray`. + + Args: + obj: Python object to convert. Can be an instance of + :class:`dpctl.tensor.usm_ndarray`, + an object representing SYCL USM allocation and implementing + ``__sycl_usm_array_interface__`` protocol, an instance + of :class:`numpy.ndarray`, an object supporting Python buffer + protocol, a Python scalar, or a (possibly nested) sequence of + Python scalars. + dtype (data type, optional): + output array data type. If ``dtype`` is + ``None``, the output array data type is inferred from data types in + ``obj``. Default: ``None`` + copy (`bool`, optional): + boolean indicating whether or not to copy the + input. If ``True``, always creates a copy. If ``False``, the + need to copy raises :exc:`ValueError`. If ``None``, tries to reuse + existing memory allocations if possible, but allows to perform + a copy otherwise. Default: ``None`` + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + memory layout of the output array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. 
``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array created from input object. + """ + # 1. Check that copy is a valid keyword + if copy not in [None, True, False]: + raise TypeError( + "Recognized copy keyword values should be True, False, or None" + ) + # 2. Check that dtype is None, or a valid dtype + if dtype is not None: + dtype = dpt.dtype(dtype) + # 3. Validate order + if not isinstance(order, str): + raise TypeError( + f"Expected order keyword to be of type str, got {type(order)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + # 4. Check that usm_type is None, or a valid value + dpt.validate_usm_type(usm_type, allow_none=True) + # 5. Normalize device/sycl_queue [keep it None if was None] + if device is not None or sycl_queue is not None: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + + # handle instance(obj, usm_ndarray) + if isinstance(obj, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + obj, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__usm_ndarray__"): + usm_arr = obj.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + usm_arr, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__sycl_usm_array_interface__"): + ary = _usm_ndarray_from_suai(obj) + return _asarray_from_usm_ndarray( + ary, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, np.ndarray): + if copy is False: + raise ValueError( + "Converting numpy.ndarray to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + obj, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if _is_object_with_buffer_protocol(obj): + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + np.array(obj), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, (list, tuple, range)): + if copy is False: + raise ValueError( + "Converting Python sequence to usm_ndarray requires a copy" + ) + seq_shape, seq_dt, devs = _array_info_sequence(obj) + if devs == _host_set: + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype, order=order), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) == 1: + seq_dev = list(devs)[0] + return _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) > 1: + devs = [dev for dev in devs if dev is not None] + if sycl_queue is None: + if len(devs) == 1: + alloc_q = devs[0] + else: + raise dpt.ExecutionPlacementError( + "Please specify `device` or `sycl_queue` keyword " + "argument to determine where to allocate the " + "resulting array." 
+ ) + else: + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + # force copying via host + None, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + # obj is a scalar, create 0d array + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order="C", + ) + + +def empty( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Creates :class:`dpctl.tensor.usm_ndarray` from uninitialized + USM allocation. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. The ``None`` value creates an + array of floating point data type. Default: ``None`` + order (``"C"``, or ``F"``): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + +def empty_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns an uninitialized :class:`dpctl.tensor.usm_ndarray` with the + same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape. + dtype (optional): + data type of the array. Can be a typestring, + a :class:`numpy.dtype` object, NumPy char string, + or a NumPy scalar type. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. 
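# asarray() host-data sketch, assuming a default SYCL device: host inputs
# (NumPy arrays, nested sequences, buffers) always require a copy into USM
# memory, so copy=False raises for them.
import numpy as np
import dpctl.tensor as dpt

arr = dpt.asarray([[1, 2], [3, 4]])          # nested sequence -> 2D array
assert arr.shape == (2, 2)
try:
    dpt.asarray(np.zeros(3), copy=False)
except ValueError:
    pass                                     # copy is unavoidable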
``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array with uninitialized memory. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpt.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + return _empty_like_orderK(x, dtype, usm_type, sycl_queue) + else: + shape = x.shape + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + +def eye( + n_rows, + n_cols=None, + /, + *, + k=0, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \ + device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th + diagonal. + + Args: + n_rows (int): + number of rows in the output array. + n_cols (int, optional): + number of columns in the output array. If ``None``, + ``n_cols = n_rows``. Default: ``None`` + k (int): + index of the diagonal, with ``0`` as the main diagonal. + A positive value of ``k`` is a superdiagonal, a negative value + is a subdiagonal. + Raises :exc:`TypeError` if ``k`` is not an integer. + Default: ``0`` + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or + a NumPy scalar type. Default: ``None`` + order ("C" or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. 
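# empty_like() sketch, assuming a default SYCL device: dtype, usm_type,
# and device default to those of the prototype array.
import dpctl.tensor as dpt

x = dpt.ones((2, 3), dtype="f4", usm_type="shared")
y = dpt.empty_like(x)
assert y.dtype == x.dtype
assert y.usm_type == "shared"
assert y.sycl_device == x.sycl_device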
+ Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + A diagonal matrix. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + n_rows = operator.index(n_rows) + n_cols = n_rows if n_cols is None else operator.index(n_cols) + k = operator.index(k) + if k >= n_cols or -k >= n_rows: + return dpt.zeros( + (n_rows, n_cols), + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + (n_rows, n_cols), + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + if n_rows != 0 and n_cols != 0: + _manager = SequentialOrderManager[sycl_queue] + hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue) + _manager.add_event_pair(hev, eye_ev) + return res + + +def _validate_fill_value(fill_val): + """Validates that `fill_val` is a numeric or boolean scalar.""" + # TODO: verify if `np.True_` and `np.False_` should be instances of + # Number in NumPy, like other NumPy scalars and like Python bools + # check for `np.bool_` separately as NumPy<2 has no `np.bool` + if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_): + raise TypeError( + f"array cannot be filled with scalar of type {type(fill_val)}" + ) + + +def full( + shape, + fill_value, + *, + dtype=None, + order="C", + device=None, + usm_type=None, + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with `fill_value`. + + Args: + shape (tuple): + Dimensions of the array to be created. + fill_value (int,float,complex,usm_ndarray): + fill value + dtype (optional): data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. 
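# eye() sketch, assuming a default SYCL device: k selects the diagonal,
# and an out-of-range k yields an all-zero matrix.
import numpy as np
import dpctl.tensor as dpt

m = dpt.eye(3, 4, k=1, dtype="i4")
assert np.array_equal(dpt.asnumpy(m), np.eye(3, 4, k=1, dtype="i4"))
assert not dpt.asnumpy(dpt.eye(2, k=5)).any()   # diagonal lies outside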
If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpt.validate_usm_type(usm_type, allow_none=True) + + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + if ( + isinstance(fill_value, dpt.usm_ndarray) + and sycl_queue is None + and device is None + ): + sycl_queue = fill_value.sycl_queue + else: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + return dpt.copy(dpt.broadcast_to(X, shape), order=order) + else: + _validate_fill_value(fill_value) + + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + usm_type = usm_type if usm_type is not None else "device" + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + fill_value = _cast_fill_val(fill_value, dtype) + + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + + +def full_like( + x, + /, + fill_value, + *, + dtype=None, + order="K", + device=None, + usm_type=None, + sycl_queue=None, +): + """full_like(x, fill_value, dtype=None, order="K", \ + device=None, usm_type=None, sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with `fill_value` + and having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): Input array from which to derive the output array + shape. + fill_value: the value to fill output array with + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or a + NumPy scalar type. If ``dtype`` is ``None``, the output array data + type is inferred from ``x``. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. 
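# full() sketch, assuming a default SYCL device: scalar fill values are
# cast to the allocated dtype, while array fill values are broadcast to
# the requested shape.
import dpctl.tensor as dpt

a = dpt.full((2, 2), 3.7, dtype="i4")           # scalar cast: 3.7 -> 3
assert int(a[0, 0]) == 3
b = dpt.full((2, 3), dpt.asarray([1, 2, 3]))    # broadcast along rows
assert b.shape == (2, 3)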
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpt.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + sh = x.shape + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + X = dpt.broadcast_to(X, sh) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = SequentialOrderManager[sycl_queue] + # order copy after tasks populating X + dep_evs = _manager.submitted_events + hev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=X, dst=res, sycl_queue=sycl_queue, depends=dep_evs + ) + _manager.add_event_pair(hev, copy_ev) + return res + else: + _validate_fill_value(fill_value) + + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + fill_value = _cast_fill_val(fill_value, dtype) + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + return full( + sh, + fill_value, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + + +def linspace( + start, + stop, + /, + num, + *, + dtype=None, + device=None, + endpoint=True, + sycl_queue=None, + usm_type="device", +): + """ + linspace(start, stop, num, dtype=None, device=None, endpoint=True, \ + sycl_queue=None, usm_type="device") + + Returns :class:`dpctl.tensor.usm_ndarray` array populated with + evenly spaced numbers of specified interval. + + Args: + start: + the start of the interval. + stop: + the end of the interval. If the ``endpoint`` is ``False``, the + function generates ``num+1`` evenly spaced points starting + with ``start`` and ending with ``stop`` and exclude the + ``stop`` from the returned array such that the returned array + consists of evenly spaced numbers over the half-open interval + ``[start, stop)``. If ``endpoint`` is ``True``, the output + array consists of evenly spaced numbers over the closed + interval ``[start, stop]``. Default: ``True`` + num (int): + number of samples. Must be a non-negative integer; otherwise, + the function raises ``ValueError`` exception. + dtype: + output array data type. Should be a floating data type. + If ``dtype`` is ``None``, the output array must be the default + floating point data type for target device. + Default: ``None`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. 
+ Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + endpoint: boolean indicating whether to include ``stop`` in the + interval. Default: ``True`` + + Returns: + usm_ndarray: + Array populated with evenly spaced numbers in the requested + interval. + """ + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dpt.validate_usm_type(usm_type, allow_none=False) + if endpoint not in [True, False]: + raise TypeError("endpoint keyword argument must be of boolean type") + + num = operator.index(num) + if num < 0: + raise ValueError("Number of points must be non-negative") + + _, dt = _coerce_and_infer_dt( + start, + stop, + dt=dtype, + sycl_queue=sycl_queue, + err_msg="start and stop must be Python scalars.", + allow_bool=True, + ) + + int_dt = None + if np.issubdtype(dt, np.integer): + if dtype is not None: + int_dt = dt + dt = ti.default_device_fp_type(sycl_queue) + dt = dpt.dtype(dt) + start = float(start) + stop = float(stop) + + res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) + _manager = SequentialOrderManager[sycl_queue] + hev, la_ev = ti._linspace_affine( + start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue + ) + _manager.add_event_pair(hev, la_ev) + + return res if int_dt is None else dpt.astype(res, int_dt) + + +def meshgrid(*arrays, indexing="xy"): + """ + Creates list of :class:`dpctl.tensor.usm_ndarray` coordinate matrices + from vectors. + + Args: + arrays (usm_ndarray): + an arbitrary number of one-dimensional arrays + representing grid coordinates. Each array should have the same + numeric data type. + indexing (``"xy"``, or ``"ij"``): + Cartesian (``"xy"``) or matrix (``"ij"``) indexing of output. + If provided zero or one one-dimensional vector(s) (i.e., the + zero- and one-dimensional cases, respectively), the ``indexing`` + keyword has no effect and should be ignored. Default: ``"xy"`` + + Returns: + List[array]: + list of ``N`` arrays, where ``N`` is the number of + provided one-dimensional input arrays. Each returned array must + have rank ``N``. + For a set of ``n`` vectors with lengths ``N0``, ``N1``, ``N2``, ... + The cartesian indexing results in arrays of shape + ``(N1, N0, N2, ...)``, while the + matrix indexing results in arrays of shape + ``(N0, N1, N2, ...)``. + Default: ``"xy"``. + + Raises: + ValueError: If vectors are not of the same data type, or are not + one-dimensional. + + """ + ref_dt = None + ref_unset = True + for array in arrays: + if not isinstance(array, dpt.usm_ndarray): + raise TypeError( + f"Expected instance of dpt.usm_ndarray, got {type(array)}." + ) + if array.ndim != 1: + raise ValueError("All arrays must be one-dimensional.") + if ref_unset: + ref_unset = False + ref_dt = array.dtype + else: + if not ref_dt == array.dtype: + raise ValueError( + "All arrays must be of the same numeric data type." 
+ ) + if indexing not in ["xy", "ij"]: + raise ValueError( + "Unrecognized indexing keyword value, expecting 'xy' or 'ij.'" + ) + n = len(arrays) + if n == 0: + return [] + + sh = (-1,) + (1,) * (n - 1) + + res = [] + if n > 1 and indexing == "xy": + res.append(dpt.reshape(arrays[0], (1, -1) + sh[2:], copy=True)) + res.append(dpt.reshape(arrays[1], sh, copy=True)) + arrays, sh = arrays[2:], sh[-2:] + sh[:-2] + + for array in arrays: + res.append(dpt.reshape(array, sh, copy=True)) + sh = sh[-1:] + sh[:-1] + + output = dpt.broadcast_arrays(*res) + + return output + + +def ones( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ones(shape, dtype=None, order="C", \ + device=None, usm_type="device", sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with ones. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Created array initialized with ones. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpt.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + + +def ones_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with ones and + having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: `None` + order ("C", "F", "A", or "K"): + memory layout for the array. 
Default: ``"K"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with ones.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
+    if (
+        not isinstance(order, str)
+        or len(order) == 0
+        or order[0] not in "CcFfAaKk"
+    ):
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
+        )
+    order = order[0].upper()
+    if dtype is None:
+        dtype = x.dtype
+    if usm_type is None:
+        usm_type = x.usm_type
+    dpt.validate_usm_type(usm_type, allow_none=False)
+    if device is None and sycl_queue is None:
+        device = x.device
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = dpt.dtype(dtype)
+    order = _normalize_order(order, x)
+    if order == "K":
+        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
+        _manager = SequentialOrderManager[sycl_queue]
+        # populating new allocation, no dependent events
+        hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
+        _manager.add_event_pair(hev, full_ev)
+        return res
+    else:
+        sh = x.shape
+        return ones(
+            sh,
+            dtype=dtype,
+            order=order,
+            device=device,
+            usm_type=usm_type,
+            sycl_queue=sycl_queue,
+        )
+
+
+def tril(x, /, *, k=0):
+    """
+    Returns the lower triangular part of a matrix (or a stack of matrices)
+    ``x``.
+
+    The lower triangular part of the matrix is defined as the elements on and
+    below the specified diagonal ``k``.
+
+    Args:
+        x (usm_ndarray):
+            Input array
+        k (int, optional):
+            Specifies the diagonal above which to set
+            elements to zero. If ``k = 0``, the diagonal is the main diagonal.
+            If ``k < 0``, the diagonal is below the main diagonal.
+            If ``k > 0``, the diagonal is above the main diagonal.
+            Default: ``0``
+
+    Returns:
+        usm_ndarray:
+            A lower-triangular array or a stack of lower-triangular arrays.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected argument of type dpnp.tensor.usm_ndarray, "
+            f"got {type(x)}."
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k >= shape[nd - 1] - 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + elif k < -shape[nd - 2]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) + + return res + + +def triu(x, /, *, k=0): + """ + Returns the upper triangular part of a matrix (or a stack of matrices) + ``x``. + + The upper triangular part of the matrix is defined as the elements on and + above the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal below which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + An upper-triangular array or a stack of upper-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpnp.tensor.usm_ndarray, " + f"got {type(x)}." + ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k > shape[nd - 1]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + elif k <= -shape[nd - 2] + 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) + + return res + + +def zeros( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with zeros. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. 
``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            Constructed array initialized with zeros.
+    """
+    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
+        raise ValueError(
+            "Unrecognized order keyword value, expecting 'F' or 'C'."
+        )
+    order = order[0].upper()
+    dpt.validate_usm_type(usm_type, allow_none=False)
+    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
+    dtype = _get_dtype(dtype, sycl_queue)
+    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
+    res = dpt.usm_ndarray(
+        shape,
+        dtype=dtype,
+        buffer=usm_type,
+        order=order,
+        buffer_ctor_kwargs={"queue": sycl_queue},
+    )
+    _manager = SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
+    _manager.add_event_pair(hev, zeros_ev)
+
+    return res
+
+
+def zeros_like(
+    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
+):
+    """
+    Creates :class:`dpctl.tensor.usm_ndarray` from USM allocation
+    initialized with zeros.
+
+    Args:
+        x (usm_ndarray):
+            Input array from which to derive the shape of the
+            output array.
+        dtype (optional):
+            data type of the array. Can be typestring,
+            a :class:`numpy.dtype` object, :mod:`numpy` char string, or a
+            NumPy scalar type. If ``None``, output array has the same data
+            type as the input array. Default: ``None``
+        order ("C", "F", "A", or "K"):
+            memory layout for the array. Default: ``"K"``
+        device (optional):
+            array API concept of device where the output array
+            is created. ``device`` can be ``None``, a oneAPI filter selector
+            string, an instance of :class:`dpctl.SyclDevice` corresponding to
+            a non-partitioned SYCL device, an instance of
+            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
+            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            Default: ``None``
+        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
+            The type of SYCL USM allocation for the output array.
+            Default: ``"device"``
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            The SYCL queue to use
+            for output array allocation and copying. ``sycl_queue`` and
+            ``device`` are complementary arguments, i.e. use one or another.
+            If both are specified, a :exc:`TypeError` is raised unless both
+            imply the same underlying SYCL queue to be used. If both are
+            ``None``, a cached queue targeting default-selected device is
+            used for allocation and population. Default: ``None``
+
+    Returns:
+        usm_ndarray:
+            New array initialized with zeros.
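+
+    Example:
+        A minimal illustrative sketch (assumes the package is importable as
+        ``dpnp.tensor``):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.ones((3,), dtype="i4")
+        >>> y = dpt.zeros_like(x)
+        >>> y.dtype
+        dtype('int32')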
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpt.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(0, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + sh = x.shape + return zeros( + sh, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) diff --git a/dpnp/tensor/_data_types.py b/dpnp/tensor/_data_types.py new file mode 100644 index 000000000000..faf30ffdabd0 --- /dev/null +++ b/dpnp/tensor/_data_types.py @@ -0,0 +1,104 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from numpy import bool_ as np_bool_ +from numpy import complexfloating as np_complexfloating +from numpy import dtype +from numpy import floating as np_floating +from numpy import integer as np_integer +from numpy import issubdtype as np_issubdtype + +from ._tensor_impl import ( + default_device_bool_type as ti_default_device_bool_type, +) +from ._tensor_impl import ( + default_device_complex_type as ti_default_device_complex_type, +) +from ._tensor_impl import default_device_fp_type as ti_default_device_fp_type +from ._tensor_impl import default_device_int_type as ti_default_device_int_type + +bool = dtype("bool") +int8 = dtype("int8") +int16 = dtype("int16") +int32 = dtype("int32") +int64 = dtype("int64") +uint8 = dtype("uint8") +uint16 = dtype("uint16") +uint32 = dtype("uint32") +uint64 = dtype("uint64") +float16 = dtype("float16") +float32 = dtype("float32") +float64 = dtype("float64") +complex64 = dtype("complex64") +complex128 = dtype("complex128") + + +def _get_dtype(inp_dt, sycl_obj, ref_type=None): + """ + Type inference utility to construct data type + object with defaults based on reference type. + + _get_dtype is used by dpctl.tensor.asarray + to infer data type of the output array from the + input sequence. + """ + if inp_dt is None: + if ref_type in [None, float] or np_issubdtype(ref_type, np_floating): + fp_dt = ti_default_device_fp_type(sycl_obj) + return dtype(fp_dt) + if ref_type in [bool, np_bool_]: + bool_dt = ti_default_device_bool_type(sycl_obj) + return dtype(bool_dt) + if ref_type is int or np_issubdtype(ref_type, np_integer): + int_dt = ti_default_device_int_type(sycl_obj) + return dtype(int_dt) + if ref_type is complex or np_issubdtype(ref_type, np_complexfloating): + cfp_dt = ti_default_device_complex_type(sycl_obj) + return dtype(cfp_dt) + raise TypeError(f"Reference type {ref_type} not recognized.") + return dtype(inp_dt) + + +__all__ = [ + "dtype", + "_get_dtype", + "bool", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", +] diff --git a/dpnp/tensor/_device.py b/dpnp/tensor/_device.py new file mode 100644 index 000000000000..5f2725c74855 --- /dev/null +++ b/dpnp/tensor/_device.py @@ -0,0 +1,197 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+import dpctl
+from dpctl._sycl_device_factory import _cached_default_device
+from dpctl._sycl_queue_manager import get_device_cached_queue
+
+from ._compute_follows_data import get_execution_queue
+
+__doc__ = "Implementation of array API mandated Device class"
+
+
+class Device:
+    """
+    An object representing the array API concept of device.
+
+    This is a wrapper around :class:`dpctl.SyclQueue` with custom
+    formatting. The class does not have a public constructor,
+    but a class method :meth:`dpctl.tensor.Device.create_device` to construct
+    it from the `device` keyword argument in Array-API functions.
+
+    Instances can be queried for ``sycl_queue``, ``sycl_context``,
+    or ``sycl_device``.
+    """
+
+    __device_queue_map__ = {}
+    sycl_queue_ = None
+
+    def __new__(cls, *args, **kwargs):
+        raise TypeError("No public constructor")
+
+    @classmethod
+    def create_device(cls, device=None):
+        """Device.create_device(device=None)
+
+        Creates an instance of Device from the argument.
+
+        Args:
+            device:
+                Device specification, i.e. `None`, :class:`.Device`,
+                :class:`dpctl.SyclQueue`, or a :class:`dpctl.SyclDevice`
+                corresponding to a root SYCL device.
+        Raises:
+            ValueError: if an instance of :class:`dpctl.SyclDevice`
+                corresponding to a sub-device was specified as the argument
+            SyclQueueCreationError: if :class:`dpctl.SyclQueue` could not be
+                created from the argument
+        """
+        dev = device
+        obj = super().__new__(cls)
+        if isinstance(dev, Device):
+            obj.sycl_queue_ = dev.sycl_queue
+        elif isinstance(dev, dpctl.SyclQueue):
+            obj.sycl_queue_ = dev
+        elif isinstance(dev, dpctl.SyclDevice):
+            par = dev.parent_device
+            if par is None:
+                obj.sycl_queue_ = get_device_cached_queue(dev)
+            else:
+                raise ValueError(
+                    f"Using non-root device {dev} to specify offloading "
+                    "target is ambiguous. Please use dpctl.SyclQueue "
+                    "targeting this device"
+                )
+        else:
+            if dev is None:
+                _dev = _cached_default_device()
+            else:
+                _dev = dpctl.SyclDevice(dev)
+            obj.sycl_queue_ = get_device_cached_queue(_dev)
+        return obj
+
+    @property
+    def sycl_queue(self):
+        """:class:`dpctl.SyclQueue` used to offload to this :class:`.Device`."""
+        return self.sycl_queue_
+
+    @property
+    def sycl_context(self):
+        """:class:`dpctl.SyclContext` associated with this :class:`.Device`."""
+        return self.sycl_queue_.sycl_context
+
+    @property
+    def sycl_device(self):
+        """:class:`dpctl.SyclDevice` targeted by this :class:`.Device`."""
+        return self.sycl_queue_.sycl_device
+
+    def __repr__(self):
+        try:
+            sd = self.sycl_device
+        except AttributeError as exc:
+            raise ValueError(
+                f"Instance of {self.__class__} is not initialized"
+            ) from exc
+        try:
+            fs = sd.filter_string
+            return f"Device({fs})"
+        except TypeError:
+            # This is a sub-device
+            return repr(self.sycl_queue)
+
+    def print_device_info(self):
+        """Outputs information about targeted SYCL device"""
+        self.sycl_device.print_device_info()
+
+    def wait(self):
+        """Call ``wait`` method of the underlying ``sycl_queue``."""
+        self.sycl_queue_.wait()
+
+    def __eq__(self, other):
+        """Equality comparison based on underlying ``sycl_queue``."""
+        if isinstance(other, Device):
+            return self.sycl_queue.__eq__(other.sycl_queue)
+        elif isinstance(other, dpctl.SyclQueue):
+            return self.sycl_queue.__eq__(other)
+        return False
+
+    def __hash__(self):
+        """Compute object's hash value."""
+        return self.sycl_queue.__hash__()
+
+
+def normalize_queue_device(sycl_queue=None, device=None):
+    """normalize_queue_device(sycl_queue=None, device=None)
+
+    Utility to process exclusive keyword arguments 'device'
+    and 'sycl_queue' in functions of `dpctl.tensor`.
+
+    Args:
+        sycl_queue (:class:`dpctl.SyclQueue`, optional):
+            explicitly indicates where USM allocation is done
+            and the population code (if any) is executed.
+            Value ``None`` means the SYCL queue is derived from
+            the `device` keyword, or the default queue is used.
+            Default: None
+        device (string, :class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`,
+            :class:`dpctl.tensor.Device`, optional):
+            array-API keyword indicating non-partitioned SYCL device
+            where array is allocated.
+
+    Returns:
+        :class:`dpctl.SyclQueue` object implied by either of provided
+        keywords. If both are None, `dpctl.SyclQueue()` is returned.
+        If both are specified and imply the same queue, `sycl_queue`
+        is returned.
+
+    Raises:
+        TypeError: if argument is not of the expected type, or keywords
+            imply incompatible queues.
+    """
+    q = sycl_queue
+    d = device
+    if q is None:
+        d = Device.create_device(d)
+        return d.sycl_queue
+    if not isinstance(q, dpctl.SyclQueue):
+        raise TypeError(f"Expected dpctl.SyclQueue, got {type(q)}")
+    if d is None:
+        return q
+    d = Device.create_device(d)
+    qq = get_execution_queue(
+        (
+            q,
+            d.sycl_queue,
+        )
+    )
+    if qq is None:
+        raise TypeError(
+            "sycl_queue and device keywords imply incompatible queues"
+        )
+    return qq
diff --git a/dpnp/tensor/_dldevice_conversions.py b/dpnp/tensor/_dldevice_conversions.py
new file mode 100644
index 000000000000..595a280689a5
--- /dev/null
+++ b/dpnp/tensor/_dldevice_conversions.py
@@ -0,0 +1,52 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl._sycl_device import SyclDevice + +from ._usmarray import DLDeviceType + + +def dldevice_to_sycl_device(dl_dev: tuple): + if isinstance(dl_dev, tuple): + if len(dl_dev) != 2: + raise ValueError("dldevice tuple must have length 2") + else: + raise TypeError( + f"dl_dev is expected to be a 2-tuple, got " f"{type(dl_dev)}" + ) + if dl_dev[0] != DLDeviceType.kDLOneAPI: + raise ValueError("dldevice type must be kDLOneAPI") + return SyclDevice(str(dl_dev[1])) + + +def sycl_device_to_dldevice(dev: SyclDevice): + if not isinstance(dev, SyclDevice): + raise TypeError( + "dev is expected to be a SyclDevice, got " f"{type(dev)}" + ) + return (DLDeviceType.kDLOneAPI, dev.get_device_id()) diff --git a/dpnp/tensor/_dlpack.pxd b/dpnp/tensor/_dlpack.pxd new file mode 100644 index 000000000000..75378bfa7a92 --- /dev/null +++ b/dpnp/tensor/_dlpack.pxd @@ -0,0 +1,73 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+cdef extern from "numpy/npy_no_deprecated_api.h":
+    pass
+from dpctl._sycl_device cimport SyclDevice
+from numpy cimport ndarray
+
+from ._usmarray cimport usm_ndarray
+
+
+cdef extern from "dlpack/dlpack.h" nogil:
+    int device_CPU "kDLCPU"
+    int device_CUDA "kDLCUDA"
+    int device_CUDAHost "kDLCUDAHost"
+    int device_CUDAManaged "kDLCUDAManaged"
+    int device_DLROCM "kDLROCM"
+    int device_ROCMHost "kDLROCMHost"
+    int device_OpenCL "kDLOpenCL"
+    int device_Vulkan "kDLVulkan"
+    int device_Metal "kDLMetal"
+    int device_VPI "kDLVPI"
+    int device_OneAPI "kDLOneAPI"
+    int device_WebGPU "kDLWebGPU"
+    int device_Hexagon "kDLHexagon"
+    int device_MAIA "kDLMAIA"
+    int device_Trn "kDLTrn"
+
+cpdef object to_dlpack_capsule(usm_ndarray array) except +
+cpdef object to_dlpack_versioned_capsule(
+    usm_ndarray array, bint copied
+) except +
+cpdef object numpy_to_dlpack_versioned_capsule(
+    ndarray array, bint copied
+) except +
+cpdef object from_dlpack_capsule(object dltensor) except +
+
+
+cdef class DLPackCreationError(Exception):
+    """
+    A DLPackCreationError exception is raised when constructing
+    DLPack capsule from `usm_ndarray` based on a USM allocation
+    on a partitioned SYCL device.
+    """
+    pass
diff --git a/dpnp/tensor/_dlpack.pyx b/dpnp/tensor/_dlpack.pyx
new file mode 100644
index 000000000000..947377d3a660
--- /dev/null
+++ b/dpnp/tensor/_dlpack.pyx
@@ -0,0 +1,1243 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +cdef extern from "numpy/npy_no_deprecated_api.h": + pass + +cimport cpython +cimport dpctl as c_dpctl +cimport dpctl.memory as c_dpmem +from dpctl._backend cimport ( + DPCTLDevice_Delete, + DPCTLDevice_GetParentDevice, + DPCTLSyclDeviceRef, + DPCTLSyclUSMRef, +) +from dpctl._sycl_queue_manager cimport get_device_cached_queue +from libc cimport stdlib +from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t +from numpy cimport ndarray + +from ._usmarray cimport ( + USM_ARRAY_C_CONTIGUOUS, + USM_ARRAY_F_CONTIGUOUS, + USM_ARRAY_WRITABLE, + usm_ndarray, +) + +import ctypes + +import dpctl +import dpctl.memory as dpmem +import numpy as np + +from ._device import Device + + +cdef extern from "dlpack/dlpack.h" nogil: + cdef int DLPACK_MAJOR_VERSION + + cdef int DLPACK_MINOR_VERSION + + cdef int DLPACK_FLAG_BITMASK_READ_ONLY + + cdef int DLPACK_FLAG_BITMASK_IS_COPIED + + ctypedef struct DLPackVersion: + uint32_t major + uint32_t minor + + cdef enum DLDeviceType: + kDLCPU + kDLCUDA + kDLCUDAHost + kDLCUDAManaged + kDLROCM + kDLROCMHost + kDLOpenCL + kDLVulkan + kDLMetal + kDLVPI + kDLOneAPI + kDLWebGPU + kDLHexagon + kDLMAIA + kDLTrn + + ctypedef struct DLDevice: + DLDeviceType device_type + int device_id + + cdef enum DLDataTypeCode: + kDLInt + kDLUInt + kDLFloat + kDLBfloat + kDLComplex + kDLBool + kDLFloat8_e3m4 + kDLFloat8_e4m3 + kDLFloat8_e4m3b11fnuz + kDLFloat8_e4m3fn + kDLFloat8_e4m3fnuz + kDLFloat8_e5m2 + kDLFloat8_e5m2fnuz + kDLFloat8_e8m0fnu + kDLFloat6_e2m3fn + kDLFloat6_e3m2fn + kDLFloat4_e2m1fn + + ctypedef struct DLDataType: + uint8_t code + uint8_t bits + uint16_t lanes + + ctypedef struct DLTensor: + void *data + DLDevice device + int ndim + DLDataType dtype + int64_t *shape + int64_t *strides + uint64_t byte_offset + + ctypedef struct DLManagedTensor: + DLTensor dl_tensor + void *manager_ctx + void (*deleter)(DLManagedTensor *) # noqa: E211 + + ctypedef struct DLManagedTensorVersioned: + DLPackVersion version + void *manager_ctx + void (*deleter)(DLManagedTensorVersioned *) # noqa: E211 + uint64_t flags + DLTensor dl_tensor + + +def get_build_dlpack_version(): + """ + Returns a tuple of integers representing the `major` and `minor` + version of DLPack :module:`dpctl.tensor` was built with. + This tuple can be passed as the `max_version` argument to + `__dlpack__` to guarantee module:`dpctl.tensor` can properly + consume capsule. + + Returns: + Tuple[int, int] + A tuple of integers representing the `major` and `minor` + version of DLPack used to build :module:`dpctl.tensor`. 
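+
+    Example:
+        An illustrative sketch of passing the returned tuple to
+        ``__dlpack__`` (assumes ``dpnp.tensor`` exposes this function and
+        an array constructor):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.ones(4)
+        >>> caps = x.__dlpack__(max_version=dpt.get_build_dlpack_version())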
+    """
+    return (DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION)
+
+
+cdef void _pycapsule_deleter(object dlt_capsule) noexcept:
+    cdef DLManagedTensor *dlm_tensor = NULL
+    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor"):
+        dlm_tensor = <DLManagedTensor *> cpython.PyCapsule_GetPointer(
+            dlt_capsule, "dltensor")
+        dlm_tensor.deleter(dlm_tensor)
+
+
+cdef void _managed_tensor_deleter(
+    DLManagedTensor *dlm_tensor
+) noexcept with gil:
+    if dlm_tensor is not NULL:
+        # we only delete shape, because we make single allocation to
+        # accommodate both shape and strides if strides are needed
+        stdlib.free(dlm_tensor.dl_tensor.shape)
+        cpython.Py_DECREF(<object> dlm_tensor.manager_ctx)
+        dlm_tensor.manager_ctx = NULL
+        stdlib.free(dlm_tensor)
+
+
+cdef void _pycapsule_versioned_deleter(object dlt_capsule) noexcept:
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor_versioned"):
+        dlmv_tensor = <DLManagedTensorVersioned *> cpython.PyCapsule_GetPointer(
+            dlt_capsule, "dltensor_versioned")
+        dlmv_tensor.deleter(dlmv_tensor)
+
+
+cdef void _managed_tensor_versioned_deleter(
+    DLManagedTensorVersioned *dlmv_tensor
+) noexcept with gil:
+    if dlmv_tensor is not NULL:
+        # we only delete shape, because we make single allocation to
+        # accommodate both shape and strides if strides are needed
+        stdlib.free(dlmv_tensor.dl_tensor.shape)
+        cpython.Py_DECREF(<object> dlmv_tensor.manager_ctx)
+        dlmv_tensor.manager_ctx = NULL
+        stdlib.free(dlmv_tensor)
+
+
+cdef object _get_default_context(c_dpctl.SyclDevice dev):
+    try:
+        default_context = dev.sycl_platform.default_context
+    except RuntimeError:
+        # RT does not support default_context
+        default_context = None
+
+    return default_context
+
+cdef int get_array_dlpack_device_id(
+    usm_ndarray usm_ary
+) except -1:
+    """Finds ordinal number of the parent of the device where the array
+    was allocated.
+    """
+    cdef c_dpctl.SyclQueue ary_sycl_queue
+    cdef c_dpctl.SyclDevice ary_sycl_device
+    cdef DPCTLSyclDeviceRef pDRef = NULL
+    cdef int device_id = -1
+
+    ary_sycl_queue = usm_ary.get_sycl_queue()
+    ary_sycl_device = ary_sycl_queue.get_sycl_device()
+
+    default_context = _get_default_context(ary_sycl_device)
+    if default_context is None:
+        # check that ary_sycl_device is a non-partitioned device
+        pDRef = DPCTLDevice_GetParentDevice(ary_sycl_device.get_device_ref())
+        if pDRef is not NULL:
+            DPCTLDevice_Delete(pDRef)
+            raise DLPackCreationError(
+                "to_dlpack_capsule: DLPack can only export arrays allocated "
+                "on non-partitioned SYCL devices on platforms where "
+                "default_context oneAPI extension is not supported."
+            )
+    else:
+        if not usm_ary.sycl_context == default_context:
+            raise DLPackCreationError(
+                "to_dlpack_capsule: DLPack can only export arrays based on USM "
+                "allocations bound to a default platform SYCL context"
+            )
+    device_id = ary_sycl_device.get_device_id()
+
+    if device_id < 0:
+        raise DLPackCreationError(
+            "get_array_dlpack_device_id: failed to determine device_id"
+        )
+
+    return device_id
+
+
+cpdef to_dlpack_capsule(usm_ndarray usm_ary):
+    """
+    to_dlpack_capsule(usm_ary)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensor`` from
+    :class:`dpctl.tensor.usm_ndarray` instance.
+
+    Args:
+        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+    Returns:
+        A new capsule with name ``"dltensor"`` that contains
+        a pointer to ``DLManagedTensor`` struct.
+    Raises:
+        DLPackCreationError: when the array can not be represented as a
+            DLPack tensor. This may happen when the array was allocated
+            on a partitioned sycl device, or its USM allocation is
+            not bound to the platform default SYCL context.
+        MemoryError: when the host allocation needed for ``DLManagedTensor``
+            did not succeed.
+        ValueError: when the array elements data type could not be
+            represented in ``DLManagedTensor``.
+    """
+    cdef DLManagedTensor *dlm_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef int nd = usm_ary.get_ndim()
+    cdef char *data_ptr = usm_ary.get_data()
+    cdef Py_ssize_t *shape_ptr = NULL
+    cdef Py_ssize_t *strides_ptr = NULL
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef int device_id = -1
+    cdef int flags = 0
+    cdef Py_ssize_t element_offset = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef Py_ssize_t si = 1
+
+    ary_base = usm_ary.get_base()
+
+    device_id = get_array_dlpack_device_id(usm_ary)
+
+    dlm_tensor = <DLManagedTensor *> stdlib.malloc(
+        sizeof(DLManagedTensor))
+    if dlm_tensor is NULL:
+        raise MemoryError(
+            "to_dlpack_capsule: Could not allocate memory for DLManagedTensor"
+        )
+    if nd > 0:
+        shape_strides_ptr = <int64_t *> stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlm_tensor)
+            raise MemoryError(
+                "to_dlpack_capsule: Could not allocate memory for shape/strides"
+            )
+        shape_ptr = usm_ary.get_shape()
+        for i in range(nd):
+            shape_strides_ptr[i] = shape_ptr[i]
+        strides_ptr = usm_ary.get_strides()
+        flags = usm_ary.flags_
+        if strides_ptr:
+            for i in range(nd):
+                shape_strides_ptr[nd + i] = strides_ptr[i]
+        else:
+            if flags & USM_ARRAY_C_CONTIGUOUS:
+                si = 1
+                for i in range(nd - 1, -1, -1):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            elif flags & USM_ARRAY_F_CONTIGUOUS:
+                si = 1
+                for i in range(0, nd):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            else:
+                stdlib.free(shape_strides_ptr)
+                stdlib.free(dlm_tensor)
+                raise BufferError(
+                    "to_dlpack_capsule: Invalid array encountered "
+                    "when building strides"
+                )
+
+        strides_ptr = &shape_strides_ptr[nd]
+
+    ary_dt = usm_ary.dtype
+    ary_dtk = ary_dt.kind
+    element_offset = usm_ary.get_offset()
+    byte_offset = element_offset * (ary_dt.itemsize)
+
+    dl_tensor = &dlm_tensor.dl_tensor
+    dl_tensor.data = <void *>(data_ptr - byte_offset)
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLOneAPI
+    dl_tensor.device.device_id = device_id
+    dl_tensor.dtype.lanes = 1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = kDLInt
+    elif (ary_dtk == "f"):
+        dl_tensor.dtype.code = kDLFloat
+    elif (ary_dtk == "c"):
+        dl_tensor.dtype.code = kDLComplex
+    else:
+        stdlib.free(shape_strides_ptr)
+        stdlib.free(dlm_tensor)
+        raise ValueError("Unrecognized array data type")
+
+    dlm_tensor.manager_ctx = <void *>ary_base
+    cpython.Py_INCREF(ary_base)
+    dlm_tensor.deleter = _managed_tensor_deleter
+
+    return cpython.PyCapsule_New(dlm_tensor, "dltensor", _pycapsule_deleter)
+
+
+cpdef to_dlpack_versioned_capsule(usm_ndarray usm_ary, bint copied):
+    """
+    to_dlpack_versioned_capsule(usm_ary, copied)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensorVersioned`` from
+    :class:`dpctl.tensor.usm_ndarray` instance.
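+
+    A minimal call-pattern sketch (illustrative; assumes an array created
+    via this module's constructors):
+
+        >>> import dpnp.tensor as dpt
+        >>> x = dpt.ones(5)
+        >>> caps = to_dlpack_versioned_capsule(x, False)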
+
+    Args:
+        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+        copied: A bint representing whether the data was previously
+            copied in order to set the flags with the is-copied
+            bitmask.
+    Returns:
+        A new capsule with name ``"dltensor_versioned"`` that
+        contains a pointer to ``DLManagedTensorVersioned`` struct.
+    Raises:
+        DLPackCreationError: when the array can not be represented as a
+            DLPack tensor. This may happen when the array was allocated
+            on a partitioned sycl device, or its USM allocation is
+            not bound to the platform default SYCL context.
+        MemoryError: when the host allocation needed for
+            ``DLManagedTensorVersioned`` did not succeed.
+        ValueError: when the array elements data type could not be
+            represented in ``DLManagedTensorVersioned``.
+    """
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef uint32_t dlmv_flags = 0
+    cdef int nd = usm_ary.get_ndim()
+    cdef char *data_ptr = usm_ary.get_data()
+    cdef Py_ssize_t *shape_ptr = NULL
+    cdef Py_ssize_t *strides_ptr = NULL
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef int device_id = -1
+    cdef int flags = 0
+    cdef Py_ssize_t element_offset = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef Py_ssize_t si = 1
+
+    ary_base = usm_ary.get_base()
+
+    # Find ordinal number of the parent device
+    device_id = get_array_dlpack_device_id(usm_ary)
+
+    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
+        sizeof(DLManagedTensorVersioned))
+    if dlmv_tensor is NULL:
+        raise MemoryError(
+            "to_dlpack_versioned_capsule: Could not allocate memory "
+            "for DLManagedTensorVersioned"
+        )
+    if nd > 0:
+        shape_strides_ptr = <int64_t *> stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlmv_tensor)
+            raise MemoryError(
+                "to_dlpack_versioned_capsule: Could not allocate memory "
+                "for shape/strides"
+            )
+        # this can be a separate function for handling shapes and strides
+        shape_ptr = usm_ary.get_shape()
+        for i in range(nd):
+            shape_strides_ptr[i] = shape_ptr[i]
+        strides_ptr = usm_ary.get_strides()
+        flags = usm_ary.flags_
+        if strides_ptr:
+            for i in range(nd):
+                shape_strides_ptr[nd + i] = strides_ptr[i]
+        else:
+            if flags & USM_ARRAY_C_CONTIGUOUS:
+                si = 1
+                for i in range(nd - 1, -1, -1):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            elif flags & USM_ARRAY_F_CONTIGUOUS:
+                si = 1
+                for i in range(0, nd):
+                    shape_strides_ptr[nd + i] = si
+                    si = si * shape_ptr[i]
+            else:
+                stdlib.free(shape_strides_ptr)
+                stdlib.free(dlmv_tensor)
+                raise BufferError(
+                    "to_dlpack_versioned_capsule: Invalid array encountered "
+                    "when building strides"
+                )
+
+        strides_ptr = &shape_strides_ptr[nd]
+
+    # this can all be a function for building the dl_tensor
+    # object (separate from dlm/dlmv)
+    ary_dt = usm_ary.dtype
+    ary_dtk = ary_dt.kind
+    element_offset = usm_ary.get_offset()
+    byte_offset = element_offset * (ary_dt.itemsize)
+
+    dl_tensor = &dlmv_tensor.dl_tensor
+    dl_tensor.data = <void *>(data_ptr - byte_offset)
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLOneAPI
+    dl_tensor.device.device_id = device_id
+    dl_tensor.dtype.lanes = 1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = kDLInt
+    elif (ary_dtk == "f"):
+        dl_tensor.dtype.code = kDLFloat
+    elif (ary_dtk == "c"):
+        dl_tensor.dtype.code = kDLComplex
+    else:
+        stdlib.free(shape_strides_ptr)
+        stdlib.free(dlmv_tensor)
+        raise ValueError("Unrecognized array data type")
+
+    # set flags down here
+    if copied:
+        dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
+    if not (flags & USM_ARRAY_WRITABLE):
+        dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
+    dlmv_tensor.flags = dlmv_flags
+
+    dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
+    dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
+
+    dlmv_tensor.manager_ctx = <void *>ary_base
+    cpython.Py_INCREF(ary_base)
+    dlmv_tensor.deleter = _managed_tensor_versioned_deleter
+
+    return cpython.PyCapsule_New(
+        dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
+    )
+
+
+cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied):
+    """
+    numpy_to_dlpack_versioned_capsule(npy_ary, copied)
+
+    Constructs named Python capsule object referencing
+    instance of ``DLManagedTensorVersioned`` from
+    :class:`numpy.ndarray` instance.
+
+    Args:
+        npy_ary: An instance of :class:`numpy.ndarray`
+        copied: A bint representing whether the data was previously
+            copied in order to set the flags with the is-copied
+            bitmask.
+    Returns:
+        A new capsule with name ``"dltensor_versioned"`` that
+        contains a pointer to ``DLManagedTensorVersioned`` struct.
+    Raises:
+        DLPackCreationError: when the array can not be represented as a
+            DLPack tensor.
+        MemoryError: when the host allocation needed for
+            ``DLManagedTensorVersioned`` did not succeed.
+        ValueError: when the array elements data type could not be
+            represented in ``DLManagedTensorVersioned``.
+    """
+    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+    cdef DLTensor *dl_tensor = NULL
+    cdef uint32_t dlmv_flags = 0
+    cdef int nd = npy_ary.ndim
+    cdef int64_t *shape_strides_ptr = NULL
+    cdef int i = 0
+    cdef Py_ssize_t byte_offset = 0
+    cdef int itemsize = npy_ary.itemsize
+
+    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
+        sizeof(DLManagedTensorVersioned))
+    if dlmv_tensor is NULL:
+        raise MemoryError(
+            "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+            "for DLManagedTensorVersioned"
+        )
+
+    shape = npy_ary.ctypes.shape_as(ctypes.c_int64)
+    strides = npy_ary.ctypes.strides_as(ctypes.c_int64)
+    if nd > 0:
+        if npy_ary.size != 1:
+            for i in range(nd):
+                if shape[i] != 1 and strides[i] % itemsize != 0:
+                    stdlib.free(dlmv_tensor)
+                    raise BufferError(
+                        "numpy_to_dlpack_versioned_capsule: DLPack cannot "
+                        "encode an array if strides are not a multiple of "
+                        "itemsize"
+                    )
+        shape_strides_ptr = <int64_t *> stdlib.malloc((sizeof(int64_t) * 2) * nd)
+        if shape_strides_ptr is NULL:
+            stdlib.free(dlmv_tensor)
+            raise MemoryError(
+                "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+                "for shape/strides"
+            )
+        for i in range(nd):
+            shape_strides_ptr[i] = shape[i]
+            shape_strides_ptr[nd + i] = strides[i] // itemsize
+
+    writable_flag = npy_ary.flags["W"]
+
+    ary_dt = npy_ary.dtype
+    ary_dtk = ary_dt.kind
+
+    dl_tensor = &dlmv_tensor.dl_tensor
+    dl_tensor.data = npy_ary.data
+    dl_tensor.ndim = nd
+    dl_tensor.byte_offset = byte_offset
+    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+    dl_tensor.device.device_type = kDLCPU
+    dl_tensor.device.device_id = 0
+    dl_tensor.dtype.lanes = 1
+    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
+    if (ary_dtk == "b"):
+        dl_tensor.dtype.code = kDLBool
+    elif (ary_dtk == "u"):
+        dl_tensor.dtype.code = kDLUInt
+    elif (ary_dtk == "i"):
+        dl_tensor.dtype.code = kDLInt
+    elif (ary_dtk == "f" and ary_dt.itemsize <= 8):
+        dl_tensor.dtype.code = kDLFloat
+    elif
(ary_dtk == "c" and ary_dt.itemsize <= 16): + dl_tensor.dtype.code = kDLComplex + else: + stdlib.free(shape_strides_ptr) + stdlib.free(dlmv_tensor) + raise ValueError("Unrecognized array data type") + + # set flags down here + if copied: + dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED + if not writable_flag: + dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY + dlmv_tensor.flags = dlmv_flags + + dlmv_tensor.version.major = DLPACK_MAJOR_VERSION + dlmv_tensor.version.minor = DLPACK_MINOR_VERSION + + dlmv_tensor.manager_ctx = npy_ary + cpython.Py_INCREF(npy_ary) + dlmv_tensor.deleter = _managed_tensor_versioned_deleter + + return cpython.PyCapsule_New( + dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter + ) + + +cdef class _DLManagedTensorOwner: + """ + Helper class managing the lifetime of the DLManagedTensor struct + transferred from a 'dlpack' capsule. + """ + cdef DLManagedTensor * dlm_tensor + + def __cinit__(self): + self.dlm_tensor = NULL + + def __dealloc__(self): + if self.dlm_tensor: + self.dlm_tensor.deleter(self.dlm_tensor) + self.dlm_tensor = NULL + + @staticmethod + cdef _DLManagedTensorOwner _create(DLManagedTensor *dlm_tensor_src): + cdef _DLManagedTensorOwner res + res = _DLManagedTensorOwner.__new__(_DLManagedTensorOwner) + res.dlm_tensor = dlm_tensor_src + return res + + +cdef class _DLManagedTensorVersionedOwner: + """ + Helper class managing the lifetime of the DLManagedTensorVersioned + struct transferred from a 'dlpack_versioned' capsule. + """ + cdef DLManagedTensorVersioned * dlmv_tensor + + def __cinit__(self): + self.dlmv_tensor = NULL + + def __dealloc__(self): + if self.dlmv_tensor: + self.dlmv_tensor.deleter(self.dlmv_tensor) + self.dlmv_tensor = NULL + + @staticmethod + cdef _DLManagedTensorVersionedOwner _create( + DLManagedTensorVersioned *dlmv_tensor_src + ): + cdef _DLManagedTensorVersionedOwner res + res = _DLManagedTensorVersionedOwner.__new__( + _DLManagedTensorVersionedOwner + ) + res.dlmv_tensor = dlmv_tensor_src + return res + + +cdef dict _numpy_array_interface_from_dl_tensor(DLTensor *dlt, bint ro_flag): + """Constructs a NumPy `__array_interface__` dictionary from a DLTensor.""" + cdef int itemsize = 0 + + if dlt.dtype.lanes != 1: + raise BufferError( + "Can not import DLPack tensor with lanes != 1" + ) + itemsize = dlt.dtype.bits // 8 + shape = list() + if (dlt.strides is NULL): + strides = None + for dim in range(dlt.ndim): + shape.append(dlt.shape[dim]) + else: + strides = list() + for dim in range(dlt.ndim): + shape.append(dlt.shape[dim]) + # convert to byte-strides + strides.append(dlt.strides[dim] * itemsize) + strides = tuple(strides) + shape = tuple(shape) + if (dlt.dtype.code == kDLUInt): + ary_dt = "u" + str(itemsize) + elif (dlt.dtype.code == kDLInt): + ary_dt = "i" + str(itemsize) + elif (dlt.dtype.code == kDLFloat): + ary_dt = "f" + str(itemsize) + elif (dlt.dtype.code == kDLComplex): + ary_dt = "c" + str(itemsize) + elif (dlt.dtype.code == kDLBool): + ary_dt = "b" + str(itemsize) + else: + raise BufferError( + "Can not import DLPack tensor with type code {}.".format( + dlt.dtype.code + ) + ) + typestr = "|" + ary_dt + return dict( + version=3, + shape=shape, + strides=strides, + data=( dlt.data, True if ro_flag else False), + offset=dlt.byte_offset, + typestr=typestr, + ) + + +class _numpy_array_interface_wrapper: + """ + Class that wraps a Python capsule and dictionary for consumption by NumPy. 
+ + Implementation taken from + https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/to_numpy.py + + Args: + array_interface: + A dictionary describing the underlying memory. Formatted + to match `numpy.ndarray.__array_interface__`. + + pycapsule: + A Python capsule wrapping the dlpack tensor that will be + converted to numpy. + """ + + def __init__(self, array_interface, memory_owner) -> None: + self.__array_interface__ = array_interface + self._memory_owner = memory_owner + + +cdef bint _is_kdlcpu_device(DLDevice *dev): + "Check if DLTensor.DLDevice denotes (kDLCPU, 0)" + return (dev[0].device_type == kDLCPU) and (dev[0].device_id == 0) + + +cpdef object from_dlpack_capsule(object py_caps): + """ + from_dlpack_capsule(py_caps) + + Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from + named Python capsule object referencing instance of ``DLManagedTensor`` + without copy. The instance forms a view in the memory of the tensor. + + Args: + caps: + Python capsule with name ``"dltensor"`` expected to reference + an instance of ``DLManagedTensor`` struct. + Returns: + Instance of :class:`dpctl.tensor.usm_ndarray` with a view into + memory of the tensor. Capsule is renamed to ``"used_dltensor"`` + upon success. + Raises: + TypeError: + if argument is not a ``"dltensor"`` capsule. + ValueError: + if argument is ``"used_dltensor"`` capsule + BufferError: + if the USM pointer is not bound to the reconstructed + sycl context, or the DLPack's device_type is not supported + by :mod:`dpctl`. + """ + cdef DLManagedTensorVersioned *dlmv_tensor = NULL + cdef DLManagedTensor *dlm_tensor = NULL + cdef DLTensor *dl_tensor = NULL + cdef int versioned = 0 + cdef int readonly = 0 + cdef bytes usm_type + cdef size_t sz = 1 + cdef size_t alloc_sz = 1 + cdef int i + cdef int device_id = -1 + cdef int element_bytesize = 0 + cdef Py_ssize_t offset_min = 0 + cdef Py_ssize_t offset_max = 0 + cdef char *mem_ptr = NULL + cdef Py_ssize_t mem_ptr_delta = 0 + cdef Py_ssize_t element_offset = 0 + cdef int64_t stride_i = -1 + cdef int64_t shape_i = -1 + + if cpython.PyCapsule_IsValid(py_caps, "dltensor"): + dlm_tensor = cpython.PyCapsule_GetPointer( + py_caps, "dltensor") + dl_tensor = &dlm_tensor.dl_tensor + elif cpython.PyCapsule_IsValid(py_caps, "dltensor_versioned"): + dlmv_tensor = cpython.PyCapsule_GetPointer( + py_caps, "dltensor_versioned") + if dlmv_tensor.version.major > DLPACK_MAJOR_VERSION: + raise BufferError( + "Can not import DLPack tensor with major version " + f"greater than {DLPACK_MAJOR_VERSION}" + ) + versioned = 1 + readonly = (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0 + dl_tensor = &dlmv_tensor.dl_tensor + elif ( + cpython.PyCapsule_IsValid(py_caps, "used_dltensor") + or cpython.PyCapsule_IsValid(py_caps, "used_dltensor_versioned") + ): + raise ValueError( + "A DLPack tensor object can not be consumed multiple times" + ) + else: + raise TypeError( + "`from_dlpack_capsule` expects a Python 'dltensor' capsule" + ) + + # Verify that we can work with this device + if dl_tensor.device.device_type == kDLOneAPI: + device_id = dl_tensor.device.device_id + root_device = dpctl.SyclDevice(str(device_id)) + try: + default_context = root_device.sycl_platform.default_context + except RuntimeError: + default_context = get_device_cached_queue(root_device).sycl_context + if dl_tensor.data is NULL: + usm_type = b"device" + q = get_device_cached_queue((default_context, root_device,)) + else: + usm_type = c_dpmem._Memory.get_pointer_type( + dl_tensor.data, + default_context) + if 
usm_type == b"unknown": + raise BufferError( + "Data pointer in DLPack is not bound to default sycl " + f"context of device '{device_id}', translated to " + f"{root_device.filter_string}" + ) + alloc_device = c_dpmem._Memory.get_pointer_device( + dl_tensor.data, + default_context + ) + q = get_device_cached_queue((default_context, alloc_device,)) + if dl_tensor.dtype.bits % 8: + raise BufferError( + "Can not import DLPack tensor whose element's " + "bitsize is not a multiple of 8" + ) + if dl_tensor.dtype.lanes != 1: + raise BufferError( + "Can not import DLPack tensor with lanes != 1" + ) + if dl_tensor.ndim > 0: + offset_min = 0 + offset_max = 0 + for i in range(dl_tensor.ndim): + stride_i = dl_tensor.strides[i] + shape_i = dl_tensor.shape[i] + if shape_i > 1: + shape_i -= 1 + if stride_i > 0: + offset_max = offset_max + stride_i * shape_i + else: + offset_min = offset_min + stride_i * shape_i + sz = offset_max - offset_min + 1 + if sz == 0: + sz = 1 + + element_bytesize = (dl_tensor.dtype.bits // 8) + sz = sz * element_bytesize + element_offset = dl_tensor.byte_offset // element_bytesize + + # transfer ownership + if not versioned: + dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) + cpython.PyCapsule_SetName(py_caps, "used_dltensor") + else: + dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor) + cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned") + + if dl_tensor.data is NULL: + usm_mem = dpmem.MemoryUSMDevice(sz, q) + else: + mem_ptr_delta = dl_tensor.byte_offset - ( + element_offset * element_bytesize + ) + mem_ptr = dl_tensor.data + alloc_sz = dl_tensor.byte_offset + ( + (offset_max + 1) * element_bytesize) + tmp = c_dpmem._Memory.create_from_usm_pointer_size_qref( + mem_ptr, + max(alloc_sz, element_bytesize), + (q).get_queue_ref(), + memory_owner=dlmv_holder if versioned else dlm_holder + ) + if mem_ptr_delta == 0: + usm_mem = tmp + else: + alloc_sz = dl_tensor.byte_offset + ( + (offset_max * element_bytesize + mem_ptr_delta)) + usm_mem = c_dpmem._Memory.create_from_usm_pointer_size_qref( + ( + mem_ptr + (element_bytesize - mem_ptr_delta) + ), + max(alloc_sz, element_bytesize), + (q).get_queue_ref(), + memory_owner=tmp + ) + + py_shape = list() + if (dl_tensor.shape is not NULL): + for i in range(dl_tensor.ndim): + py_shape.append(dl_tensor.shape[i]) + if (dl_tensor.strides is not NULL): + py_strides = list() + for i in range(dl_tensor.ndim): + py_strides.append(dl_tensor.strides[i]) + else: + py_strides = None + if (dl_tensor.dtype.code == kDLUInt): + ary_dt = np.dtype("u" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLInt): + ary_dt = np.dtype("i" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLFloat): + ary_dt = np.dtype("f" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLComplex): + ary_dt = np.dtype("c" + str(element_bytesize)) + elif (dl_tensor.dtype.code == kDLBool): + ary_dt = np.dtype("?") + else: + raise BufferError( + "Can not import DLPack tensor with type code {}.".format( + dl_tensor.dtype.code + ) + ) + res_ary = usm_ndarray( + py_shape, + dtype=ary_dt, + buffer=usm_mem, + strides=py_strides, + offset=element_offset + ) + if readonly: + res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE) + return res_ary + elif _is_kdlcpu_device(&dl_tensor.device): + ary_iface = _numpy_array_interface_from_dl_tensor(dl_tensor, readonly) + if not versioned: + dlm_holder = _DLManagedTensorOwner._create(dlm_tensor) + cpython.PyCapsule_SetName(py_caps, "used_dltensor") + return np.ctypeslib.as_array( + 
+
+
+cdef usm_ndarray _to_usm_ary_from_host_blob(object host_blob, dev: Device):
+    q = dev.sycl_queue
+    np_ary = np.asarray(host_blob)
+    dt = np_ary.dtype
+    if dt.char in "dD" and q.sycl_device.has_aspect_fp64 is False:
+        Xusm_dtype = (
+            "float32" if dt.char == "d" else "complex64"
+        )
+    else:
+        Xusm_dtype = dt
+    usm_mem = dpmem.MemoryUSMDevice(np_ary.nbytes, queue=q)
+    usm_ary = usm_ndarray(np_ary.shape, dtype=Xusm_dtype, buffer=usm_mem)
+    usm_mem.copy_from_host(np.reshape(np_ary.view(dtype="u1"), -1))
+    return usm_ary
+
+
+# only cdef to make it private
+cdef object _create_device(object device, object dl_device):
+    if isinstance(device, Device):
+        return device
+    elif isinstance(device, dpctl.SyclDevice):
+        return Device.create_device(device)
+    else:
+        root_device = dpctl.SyclDevice(str(dl_device[1]))
+        return Device.create_device(root_device)
+
+
+def from_dlpack(x, /, *, device=None, copy=None):
+    """from_dlpack(x, /, *, device=None, copy=None)
+
+    Constructs :class:`dpctl.tensor.usm_ndarray` or :class:`numpy.ndarray`
+    instance from a Python object ``x`` that implements the ``__dlpack__``
+    protocol.
+
+    Args:
+        x (object):
+            A Python object representing an array that supports
+            ``__dlpack__`` protocol.
+        device (
+            Optional[str, :class:`dpctl.SyclDevice`,
+            :class:`dpctl.SyclQueue`,
+            :class:`dpctl.tensor.Device`,
+            tuple([:class:`enum.IntEnum`, int])])
+        ):
+            Device where the output array is to be placed. ``device`` keyword
+            values can be:
+
+            * ``None``
+                The data remains on the same device.
+            * oneAPI filter selector string
+                SYCL device selected by :ref:`filter selector string
+                <filter_selector_string>`.
+            * :class:`dpctl.SyclDevice`
+                explicit SYCL device that must correspond to
+                a non-partitioned SYCL device.
+            * :class:`dpctl.SyclQueue`
+                implies SYCL device targeted by the SYCL queue.
+            * :class:`dpctl.tensor.Device`
+                implies SYCL device ``device.sycl_queue``. The `Device`
+                object is obtained via
+                :attr:`dpctl.tensor.usm_ndarray.device`.
+            * ``(device_type, device_id)``
+                2-tuple matching the format of the output of the
+                ``__dlpack_device__`` method: an integer enumerator
+                representing the device type followed by an integer
+                representing the index of the device. The only supported
+                :class:`dpctl.tensor.DLDeviceType` device types are
+                ``"kDLCPU"`` and ``"kDLOneAPI"``.
+
+            Default: ``None``.
+
+        copy (bool, optional):
+            Boolean indicating whether or not to copy the input.
+
+            * If ``copy`` is ``True``, the input will always be
+              copied.
+            * If ``False``, a ``BufferError`` will be raised if a
+              copy is deemed necessary.
+            * If ``None``, a copy will be made only if deemed
+              necessary, otherwise, the existing memory buffer will
+              be reused.
+
+            Default: ``None``.
+
+    Returns:
+        Alternative[usm_ndarray, numpy.ndarray]:
+            An array containing the data in ``x``. When ``copy`` is
+            ``None`` or ``False``, this may be a view into the original
+            memory.
+
+            The type of the returned object
+            depends on where the data backing up input object ``x`` resides.
+ If it resides in a USM allocation on a SYCL device, the + type :class:`dpctl.tensor.usm_ndarray` is returned, otherwise if it + resides on ``"kDLCPU"`` device the type is :class:`numpy.ndarray`, + and otherwise an exception is raised. + + .. note:: + + If the return type is :class:`dpctl.tensor.usm_ndarray`, the + associated SYCL queue is derived from the ``device`` keyword. + When ``device`` keyword value has type :class:`dpctl.SyclQueue`, + the explicit queue instance is used, when ``device`` keyword + value has type :class:`dpctl.tensor.Device`, the + ``device.sycl_queue`` is used. In all other cases, the cached + SYCL queue corresponding to the implied SYCL device is used. + + Raises: + TypeError: + if ``x`` does not implement ``__dlpack__`` method + ValueError: + if data of the input object resides on an unsupported device + + See https://dmlc.github.io/dlpack/latest/ for more details. + + :Example: + + .. code-block:: python + + import dpctl + import dpnp.tensor as dpt + + class Container: + "Helper class implementing `__dlpack__` protocol" + def __init__(self, array): + self._array = array + + def __dlpack__(self, stream=None): + return self._array.__dlpack__(stream=stream) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + C = Container(dpt.linspace(0, 100, num=20, dtype="int16")) + # create usm_ndarray view + X = dpt.from_dlpack(C) + # migrate content of the container to device of type kDLCPU + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + + """ + dlpack_attr = getattr(x, "__dlpack__", None) + dlpack_dev_attr = getattr(x, "__dlpack_device__", None) + if not callable(dlpack_attr) or not callable(dlpack_dev_attr): + raise TypeError( + f"The argument of type {type(x)} does not implement " + "`__dlpack__` and `__dlpack_device__` methods." + ) + # device is converted to a dlpack_device if necessary + dl_device = None + if device: + if isinstance(device, tuple): + dl_device = device + if len(dl_device) != 2: + raise ValueError( + "Argument `device` specified as a tuple must have length 2" + ) + else: + if not isinstance(device, dpctl.SyclDevice): + device = Device.create_device(device) + d = device.sycl_device + else: + d = device + dl_device = (device_OneAPI, d.get_device_id()) + if dl_device is not None: + if (dl_device[0] not in [device_OneAPI, device_CPU]): + raise ValueError( + f"Argument `device`={device} is not supported." 
+ ) + got_type_error = False + got_buffer_error = False + got_other_error = False + saved_exception = None + # First DLPack version supporting dl_device, and copy + requested_ver = (1, 0) + cpu_dev = (device_CPU, 0) + try: + # setting max_version to minimal version that supports + # dl_device/copy keywords + dlpack_capsule = dlpack_attr( + max_version=requested_ver, + dl_device=dl_device, + copy=copy + ) + except TypeError: + # exporter does not support max_version keyword + got_type_error = True + except (BufferError, NotImplementedError, ValueError) as e: + # Either dl_device, or copy cannot be satisfied + got_buffer_error = True + saved_exception = e + except Exception as e: + got_other_error = True + saved_exception = e + else: + # execution did not raise exceptions + return from_dlpack_capsule(dlpack_capsule) + finally: + if got_type_error: + # max_version/dl_device, copy keywords are not supported + # by __dlpack__ + x_dldev = dlpack_dev_attr() + if (dl_device is None) or (dl_device == x_dldev): + dlpack_capsule = dlpack_attr() + return from_dlpack_capsule(dlpack_capsule) + # must copy via host + if copy is False: + raise BufferError( + "Importing data via DLPack requires copying, but " + "copy=False was provided" + ) + # when max_version/dl_device/copy are not supported + # we can only support importing to OneAPI devices + # from host, or from another oneAPI device + is_supported_x_dldev = ( + x_dldev == cpu_dev or + (x_dldev[0] == device_OneAPI) + ) + is_supported_dl_device = ( + dl_device == cpu_dev or + dl_device[0] == device_OneAPI + ) + if is_supported_x_dldev and is_supported_dl_device: + dlpack_capsule = dlpack_attr() + blob = from_dlpack_capsule(dlpack_capsule) + else: + raise BufferError( + f"Can not import to requested device {dl_device}" + ) + dev = _create_device(device, dl_device) + if x_dldev == cpu_dev and dl_device == cpu_dev: + # both source and destination are CPU + return blob + elif x_dldev == cpu_dev: + # source is CPU, destination is oneAPI + return _to_usm_ary_from_host_blob(blob, dev) + elif dl_device == cpu_dev: + # source is oneAPI, destination is CPU + cpu_caps = blob.__dlpack__( + max_version=get_build_dlpack_version(), + dl_device=cpu_dev + ) + return from_dlpack_capsule(cpu_caps) + else: + import dpnp.tensor as dpt + return dpt.asarray(blob, device=dev) + elif got_buffer_error: + # we are here, because dlpack_attr could not deal with requested + # dl_device, or copying was required + if copy is False: + raise BufferError( + "Importing data via DLPack requires copying, but " + "copy=False was provided" + ) + if dl_device is None: + raise saved_exception + # must copy via host + if dl_device[0] != device_OneAPI: + raise BufferError( + f"Can not import to requested device {dl_device}" + ) + x_dldev = dlpack_dev_attr() + if x_dldev == cpu_dev: + dlpack_capsule = dlpack_attr() + host_blob = from_dlpack_capsule(dlpack_capsule) + else: + dlpack_capsule = dlpack_attr( + max_version=requested_ver, + dl_device=cpu_dev, + copy=copy + ) + host_blob = from_dlpack_capsule(dlpack_capsule) + dev = _create_device(device, dl_device) + return _to_usm_ary_from_host_blob(host_blob, dev) + elif got_other_error: + raise saved_exception diff --git a/dpnp/tensor/_elementwise_common.py b/dpnp/tensor/_elementwise_common.py new file mode 100644 index 000000000000..2eb89b8fb5f8 --- /dev/null +++ b/dpnp/tensor/_elementwise_common.py @@ -0,0 +1,988 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All 
rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
+from ._manipulation_functions import _broadcast_shape_impl
+from ._scalar_utils import (
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _validate_dtype,
+)
+from ._type_utils import (
+    _acceptance_fn_default_binary,
+    _acceptance_fn_default_unary,
+    _all_data_types,
+    _find_buf_dtype,
+    _find_buf_dtype2,
+    _find_buf_dtype_in_place_op,
+    _resolve_weak_types,
+)
+
+
+class UnaryElementwiseFunc:
+    """
+    Class that implements unary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the unary function
+        result_type_resolver_fn (callable):
+            Function that takes the dtype of the input and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        unary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values, effectively
+            evaluating `dst = func(src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this unary
+            function. The function takes 4 arguments:
+                arg_dtype - Data type of the first argument
+                buf_dtype - Data type the argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                    evaluation is carried out.
+            The function is invoked when the argument of the unary function
+            requires casting, e.g. the argument of `dpctl.tensor.log` is an
+            array with integral data type.
+        docs (str):
+            Documentation string for the unary function.
+    """
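For orientation, a callable satisfying the `unary_dp_impl_fn` contract just described has roughly this shape (a sketch; `_enqueue_kernel` is a hypothetical stand-in for a compiled kernel launcher such as those in `dpnp.tensor._tensor_elementwise_impl`):

```python
def _hypothetical_unary_impl(src, dst, sycl_queue, depends=None):
    # depends: events the offloaded kernel must wait on
    deps = [] if depends is None else depends
    # _enqueue_kernel is hypothetical: it submits the computation plus a
    # host task that keeps src/dst alive until the kernel completes
    ht_ev, comp_ev = _enqueue_kernel(src, dst, sycl_queue, deps)
    return ht_ev, comp_ev  # (host-task event, computation event)
```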
+
+    def __init__(
+        self,
+        name,
+        result_type_resolver_fn,
+        unary_dp_impl_fn,
+        docs,
+        acceptance_fn=None,
+    ):
+        self.__name__ = "UnaryElementwiseFunc"
+        self.name_ = name
+        self.result_type_resolver_fn_ = result_type_resolver_fn
+        self.types_ = None
+        self.unary_fn_ = unary_dp_impl_fn
+        self.__doc__ = docs
+        if callable(acceptance_fn):
+            self.acceptance_fn_ = acceptance_fn
+        else:
+            self.acceptance_fn_ = _acceptance_fn_default_unary
+
+    def __str__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def __repr__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def get_implementation_function(self):
+        """Returns the implementation function for
+        this elementwise unary function.
+
+        """
+        return self.unary_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise unary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise unary function.
+
+        Acceptance function influences the type promotion
+        behavior of this unary function.
+        The function takes 4 arguments:
+            arg_dtype - Data type of the first argument
+            buf_dtype - Data type the argument would be cast to
+            res_dtype - Data type of the output array with function values
+            sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                evaluation is carried out.
+        The function is invoked when the argument of the unary function
+        requires casting, e.g. the argument of `dpctl.tensor.log` is an
+        array with integral data type.
+        """
+        return self.acceptance_fn_
+
+    @property
+    def nin(self):
+        """Returns the number of arguments treated as inputs."""
+        return 1
+
+    @property
+    def nout(self):
+        """Returns the number of arguments treated as outputs."""
+        return 1
+
+    @property
+    def types(self):
+        """Returns information about types supported by
+        implementation function, using NumPy's character
+        encoding for data types, e.g.
+
+        :Example:
+            .. code-block:: python
+
+                dpctl.tensor.sin.types
+                # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
+        """
+        types = self.types_
+        if not types:
+            types = []
+            for dt1 in _all_data_types(True, True):
+                dt2 = self.result_type_resolver_fn_(dt1)
+                if dt2:
+                    types.append(f"{dt1.char}->{dt2.char}")
+            self.types_ = types
+        return types
+
+    def __call__(self, x, /, *, out=None, order="K"):
+        if not isinstance(x, dpt.usm_ndarray):
+            raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
+
+        if order not in ["C", "F", "K", "A"]:
+            order = "K"
+        buf_dt, res_dt = _find_buf_dtype(
+            x.dtype,
+            self.result_type_resolver_fn_,
+            x.sycl_device,
+            acceptance_fn=self.acceptance_fn_,
+        )
+        if res_dt is None:
+            raise ValueError(
+                f"function '{self.name_}' does not support input type "
+                f"({x.dtype}), "
+                "and the input could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+
+        orig_out = out
+        if out is not None:
+            if not isinstance(out, dpt.usm_ndarray):
+                raise TypeError(
+                    f"output array must be of usm_ndarray type, got {type(out)}"
+                )
+
+            if not out.flags.writable:
+                raise ValueError("provided `out` array is read-only")
+
+            if out.shape != x.shape:
+                raise ValueError(
+                    "The shape of input and output arrays are inconsistent. "
+                    f"Expected output shape is {x.shape}, got {out.shape}"
+                )
+
+            if res_dt != out.dtype:
+                raise ValueError(
+                    f"Output array of type {res_dt} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if (
+                buf_dt is None
+                and ti._array_overlap(x, out)
+                and not ti._same_logical_tensors(x, out)
+            ):
+                # Allocate a temporary buffer to avoid memory overlapping.
+                # Note if `buf_dt` is not None, a temporary copy of `x` will be
+                # created, so the array overlap check isn't needed.
+                out = dpt.empty_like(out)
+
+            if dpt.get_execution_queue((x.sycl_queue, out.sycl_queue)) is None:
+                raise dpt.ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+        exec_q = x.sycl_queue
+        _manager = SequentialOrderManager[exec_q]
+        if buf_dt is None:
+            if out is None:
+                if order == "K":
+                    out = _empty_like_orderK(x, res_dt)
+                else:
+                    if order == "A":
+                        order = "F" if x.flags.f_contiguous else "C"
+                    out = dpt.empty_like(x, dtype=res_dt, order=order)
+
+            dep_evs = _manager.submitted_events
+            ht_unary_ev, unary_ev = self.unary_fn_(
+                x, out, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_unary_ev, unary_ev)
+
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
+                )
+                _manager.add_event_pair(ht_copy_ev, cpy_ev)
+                out = orig_out
+
+            return out
+
+        if order == "K":
+            buf = _empty_like_orderK(x, buf_dt)
+        else:
+            if order == "A":
+                order = "F" if x.flags.f_contiguous else "C"
+            buf = dpt.empty_like(x, dtype=buf_dt, order=order)
+
+        dep_evs = _manager.submitted_events
+        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(ht_copy_ev, copy_ev)
+        if out is None:
+            if order == "K":
+                out = _empty_like_orderK(buf, res_dt)
+            else:
+                out = dpt.empty_like(buf, dtype=res_dt, order=order)
+
+        ht, uf_ev = self.unary_fn_(
+            buf, out, sycl_queue=exec_q, depends=[copy_ev]
+        )
+        _manager.add_event_pair(ht, uf_ev)
+
+        return out
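With the class in place, registering and calling a unary function follows the pattern of the bindings added later in this patch (a sketch; the docstring argument is abbreviated and `x` stands for any `usm_ndarray`):

```python
import dpnp.tensor._tensor_elementwise_impl as ti
from dpnp.tensor._elementwise_common import UnaryElementwiseFunc

# ti._cos_result_type / ti._cos are the resolver and kernel used by the
# `cos` binding defined in _elementwise_funcs.py below
cos = UnaryElementwiseFunc("cos", ti._cos_result_type, ti._cos, "cos(x)")

y = cos(x)                 # allocates the result, order="K" by default
cos(x, out=y, order="C")   # writes into a preallocated output array
```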
+
+
+class BinaryElementwiseFunc:
+    """
+    Class that implements binary element-wise functions.
+
+    Args:
+        name (str):
+            Name of the binary function
+        result_type_resolver_fn (callable):
+            Function that takes the dtypes of the inputs and
+            returns the dtype of the result if the
+            implementation function supports it, or
+            returns `None` otherwise.
+        binary_dp_impl_fn (callable):
+            Data-parallel implementation function with signature
+            `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src1` and `src2` are the argument arrays, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(src1, src2)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including lifetime management of argument Python objects to ensure
+            that their associated USM allocation is not freed before offloaded
+            computational tasks complete execution, while the second event
+            corresponds to computational tasks associated with function
+            evaluation.
+        docs (str):
+            Documentation string for the binary function.
+        binary_inplace_fn (callable, optional):
+            Data-parallel implementation function with signature
+            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
+            sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
+            where the `src` is the argument array, `dst` is the
+            array to be populated with function values,
+            i.e. `dst=func(dst, src)`.
+            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
+            The first event corresponds to data-management host tasks,
+            including async lifetime management of Python arguments,
+            while the second event corresponds to computational tasks
+            associated with function evaluation.
+        acceptance_fn (callable, optional):
+            Function to influence type promotion behavior of this binary
+            function. The function takes 6 arguments:
+                arg1_dtype - Data type of the first argument
+                arg2_dtype - Data type of the second argument
+                ret_buf1_dtype - Data type the first argument would be cast to
+                ret_buf2_dtype - Data type the second argument would be cast to
+                res_dtype - Data type of the output array with function values
+                sycl_dev - The :class:`dpctl.SyclDevice` where the function
+                    evaluation is carried out.
+            The function is only called when both arguments of the binary
+            function require casting, e.g. both arguments of
+            `dpctl.tensor.logaddexp` are arrays with integral data type.
+    """
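To make the 6-argument contract concrete, an acceptance function vetoing a promotion path could look like this (a hypothetical sketch; it is not one of the `_acceptance_fn_*` helpers imported above):

```python
# Hypothetical acceptance function following the documented contract;
# returning False rejects the proposed promotion path.
def _reject_small_float_buffers(
    arg1_dtype, arg2_dtype, ret_buf1_dtype, ret_buf2_dtype, res_dtype, sycl_dev
):
    # e.g. refuse paths that would buffer both inputs in sub-4-byte types
    return not (
        ret_buf1_dtype.itemsize < 4 and ret_buf2_dtype.itemsize < 4
    )
```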
+
+    def __init__(
+        self,
+        name,
+        result_type_resolver_fn,
+        binary_dp_impl_fn,
+        docs,
+        binary_inplace_fn=None,
+        acceptance_fn=None,
+        weak_type_resolver=None,
+    ):
+        self.__name__ = "BinaryElementwiseFunc"
+        self.name_ = name
+        self.result_type_resolver_fn_ = result_type_resolver_fn
+        self.types_ = None
+        self.binary_fn_ = binary_dp_impl_fn
+        self.binary_inplace_fn_ = binary_inplace_fn
+        self.__doc__ = docs
+        if callable(acceptance_fn):
+            self.acceptance_fn_ = acceptance_fn
+        else:
+            self.acceptance_fn_ = _acceptance_fn_default_binary
+        if callable(weak_type_resolver):
+            self.weak_type_resolver_ = weak_type_resolver
+        else:
+            self.weak_type_resolver_ = _resolve_weak_types
+
+    def __str__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def __repr__(self):
+        return f"<{self.__name__} '{self.name_}'>"
+
+    def get_implementation_function(self):
+        """Returns the out-of-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_fn_
+
+    def get_implementation_inplace_function(self):
+        """Returns the in-place implementation
+        function for this elementwise binary function.
+
+        """
+        return self.binary_inplace_fn_
+
+    def get_type_result_resolver_function(self):
+        """Returns the type resolver function for this
+        elementwise binary function.
+        """
+        return self.result_type_resolver_fn_
+
+    def get_type_promotion_path_acceptance_function(self):
+        """Returns the acceptance function for this
+        elementwise binary function.
+
+        Acceptance function influences the type promotion
+        behavior of this binary function.
+ The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation + is carried out. + + The acceptance function is only invoked if both input arrays must be + cast to intermediary data types, as would happen during call of + `dpctl.tensor.hypot` with both arrays being of integral data type. + """ + return self.acceptance_fn_ + + def get_array_dtype_scalar_type_resolver_function(self): + """Returns the function which determines how to treat + Python scalar types for this elementwise binary function. + + Resolver influences what type the scalar will be + treated as prior to type promotion behavior. + The function takes 3 arguments: + + Args: + o1_dtype (object, dtype): + A class representing a Python scalar type or a ``dtype`` + o2_dtype (object, dtype): + A class representing a Python scalar type or a ``dtype`` + sycl_dev (:class:`dpctl.SyclDevice`): + Device on which function evaluation is carried out. + + One of ``o1_dtype`` and ``o2_dtype`` must be a ``dtype`` instance. + """ + return self.weak_type_resolver_ + + @property + def nin(self): + """Returns the number of arguments treated as inputs.""" + return 2 + + @property + def nout(self): + """Returns the number of arguments treated as outputs.""" + return 1 + + @property + def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.divide.types + # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D', + # 'Ff->F', 'FF->F', 'Dd->D', 'DD->D'] + """ + types = self.types_ + if not types: + types = [] + _all_dtypes = _all_data_types(True, True) + for dt1 in _all_dtypes: + for dt2 in _all_dtypes: + dt3 = self.result_type_resolver_fn_(dt1, dt2) + if dt3: + types.append(f"{dt1.char}{dt2.char}->{dt3.char}") + self.types_ = types + return types + + def __call__(self, o1, o2, /, *, out=None, order="K"): + if order not in ["K", "C", "F", "A"]: + order = "K" + q1, o1_usm_type = _get_queue_usm_type(o1) + q2, o2_usm_type = _get_queue_usm_type(o2) + if q1 is None and q2 is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + "One of the arguments must represent USM allocation and " + "expose `__sycl_usm_array_interface__` property" + ) + if q1 is None: + exec_q = q2 + res_usm_type = o2_usm_type + elif q2 is None: + exec_q = q1 + res_usm_type = o1_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + o1_usm_type, + o2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + o1_shape = _get_shape(o1) + o2_shape = _get_shape(o2) + if not all( + isinstance(s, (tuple, list)) + for s in ( + o1_shape, + o2_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + o1_shape, + o2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{o1_shape} and {o2_shape}" + ) + sycl_dev = exec_q.sycl_device + o1_dtype = _get_dtype(o1, sycl_dev) + o2_dtype = _get_dtype(o2, sycl_dev) + if not all(_validate_dtype(o) for o in (o1_dtype, o2_dtype)): + raise ValueError("Operands have unsupported data types") + + o1_dtype, o2_dtype = self.weak_type_resolver_( + o1_dtype, o2_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + o1_dtype, + o2_dtype, + self.result_type_resolver_fn_, + sycl_dev, + acceptance_fn=self.acceptance_fn_, + ) + + if res_dt is None: + raise ValueError( + f"function '{self.name_}' does not support input types " + f"({o1_dtype}, {o2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + _manager = SequentialOrderManager[exec_q] + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if isinstance(o1, dpt.usm_ndarray): + if ti._array_overlap(o1, out) and buf1_dt is None: + if not ti._same_logical_tensors(o1, out): + out = dpt.empty_like(out) + elif self.binary_inplace_fn_ is not None: + # if there is a dedicated in-place kernel + # it can be called here, otherwise continues + if isinstance(o2, dpt.usm_ndarray): + src2 = o2 + if ( + ti._array_overlap(o2, out) + and not ti._same_logical_tensors(o2, out) + and buf2_dt is None + ): + buf2_dt = o2_dtype + else: + src2 = dpt.asarray( + o2, dtype=o2_dtype, sycl_queue=exec_q + ) + if buf2_dt is None: + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + dep_evs = _manager.submitted_events + ht_, comp_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=src2, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_, comp_ev) + else: + buf2 = dpt.empty_like(src2, dtype=buf2_dt) + dep_evs = _manager.submitted_events + ( + ht_copy_ev, + copy_ev, + ) = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, + dst=buf2, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, bf_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=buf2, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_, bf_ev) + + return out + + if isinstance(o2, dpt.usm_ndarray): + if ( + ti._array_overlap(o2, out) + and not ti._same_logical_tensors(o2, out) + and buf2_dt is None + ): + # should not reach if out is reallocated + # after being checked against o1 + out = dpt.empty_like(out) + + if isinstance(o1, dpt.usm_ndarray): + src1 = o1 + else: + src1 = dpt.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q) + if isinstance(o2, dpt.usm_ndarray): + src2 = o2 + else: + src2 = dpt.asarray(o2, 
dtype=o2_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + src1, + src2, + ) + ) + else "C" + ) + + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + src1, src2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if src1.shape != res_shape: + src1 = dpt.broadcast_to(src1, res_shape) + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + deps_ev = _manager.submitted_events + ht_binary_ev, binary_ev = self.binary_fn_( + src1=src1, + src2=src2, + dst=out, + sycl_queue=exec_q, + depends=deps_ev, + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(src2, buf2_dt) + else: + buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + src1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if src1.shape != res_shape: + src1 = dpt.broadcast_to(src1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = self.binary_fn_( + src1=src1, + src2=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(src1, buf1_dt) + else: + buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, src2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + buf1 = dpt.broadcast_to(buf1, res_shape) + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + ht_binary_ev, binary_ev = self.binary_fn_( + src1=buf1, + src2=src2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + 
_manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + if order == "K": + if src1.flags.c_contiguous and src2.flags.c_contiguous: + order = "C" + elif src1.flags.f_contiguous and src2.flags.f_contiguous: + order = "F" + if order == "K": + buf1 = _empty_like_orderK(src1, buf1_dt) + else: + buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order) + dep_evs = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = _empty_like_orderK(src2, buf2_dt) + else: + buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + buf1 = dpt.broadcast_to(buf1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, bf_ev = self.binary_fn_( + src1=buf1, + src2=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, bf_ev) + return out + + def _inplace_op(self, o1, o2): + if self.binary_inplace_fn_ is None: + raise ValueError( + "binary function does not have a dedicated in-place " + "implementation" + ) + if not isinstance(o1, dpt.usm_ndarray): + raise TypeError( + "Expected first argument to be " + f"dpnp.tensor.usm_ndarray, got {type(o1)}" + ) + if not o1.flags.writable: + raise ValueError("provided left-hand side array is read-only") + q1, o1_usm_type = o1.sycl_queue, o1.usm_type + q2, o2_usm_type = _get_queue_usm_type(o2) + if q2 is None: + exec_q = q1 + res_usm_type = o1_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + o1_usm_type, + o2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + o1_shape = o1.shape + o2_shape = _get_shape(o2) + if not isinstance(o2_shape, (tuple, list)): + raise TypeError( + "Shape of second argument can not be inferred. " + "Expected list or tuple." + ) + try: + res_shape = _broadcast_shape_impl( + [ + o1_shape, + o2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{o1_shape} and {o2_shape}" + ) + + if res_shape != o1_shape: + raise ValueError( + "The shape of the non-broadcastable left-hand " + f"side {o1_shape} is inconsistent with the " + f"broadcast shape {res_shape}." 
+ ) + + sycl_dev = exec_q.sycl_device + o1_dtype = o1.dtype + o2_dtype = _get_dtype(o2, sycl_dev) + if not _validate_dtype(o2_dtype): + raise ValueError("Operand has an unsupported data type") + + o1_dtype, o2_dtype = self.weak_type_resolver_( + o1_dtype, o2_dtype, sycl_dev + ) + + buf_dt, res_dt = _find_buf_dtype_in_place_op( + o1_dtype, + o2_dtype, + self.result_type_resolver_fn_, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{self.name_}' does not support input types " + f"({o1_dtype}, {o2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule " + "''same_kind''." + ) + + if res_dt != o1_dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {o1_dtype}" + ) + + _manager = SequentialOrderManager[exec_q] + if isinstance(o2, dpt.usm_ndarray): + src2 = o2 + if ( + ti._array_overlap(o2, o1) + and not ti._same_logical_tensors(o2, o1) + and buf_dt is None + ): + buf_dt = o2_dtype + else: + src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q) + if buf_dt is None: + if src2.shape != res_shape: + src2 = dpt.broadcast_to(src2, res_shape) + dep_evs = _manager.submitted_events + ht_, comp_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=src2, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_, comp_ev) + else: + buf = dpt.empty_like(src2, dtype=buf_dt) + dep_evs = _manager.submitted_events + ( + ht_copy_ev, + copy_ev, + ) = ti._copy_usm_ndarray_into_usm_ndarray( + src=src2, + dst=buf, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + + buf = dpt.broadcast_to(buf, res_shape) + ht_, bf_ev = self.binary_inplace_fn_( + lhs=o1, + rhs=buf, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_, bf_ev) + + return o1 diff --git a/dpnp/tensor/_elementwise_funcs.py b/dpnp/tensor/_elementwise_funcs.py new file mode 100644 index 000000000000..4040f33bf38e --- /dev/null +++ b/dpnp/tensor/_elementwise_funcs.py @@ -0,0 +1,2276 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpnp.tensor._tensor_elementwise_impl as ti + +from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc +from ._type_utils import ( + _acceptance_fn_divide, + _acceptance_fn_negative, + _acceptance_fn_reciprocal, + _acceptance_fn_round, + _acceptance_fn_subtract, + _resolve_weak_types_all_py_ints, +) + +# U01: ==== ABS (x) +_abs_docstring_ = r""" +abs(x, /, \*, out=None, order='K') + +Calculates the absolute value for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, + if parameter `out` is ``None``. + Default: `"K"`. + +Returns: + usm_ndarray: + An array containing the element-wise absolute values. + For complex input, the absolute value is its magnitude. + If `x` has a real-valued data type, the returned array has the + same data type as `x`. If `x` has a complex floating-point data type, + the returned array has a real-valued floating-point data type whose + precision matches the precision of `x`. +""" + +abs = UnaryElementwiseFunc("abs", ti._abs_result_type, ti._abs, _abs_docstring_) +del _abs_docstring_ + +# U02: ==== ACOS (x) +_acos_docstring = r""" +acos(x, /, \*, out=None, order='K') + +Computes inverse cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse cosine, in radians + and in the closed interval :math:`[0, \pi]`. The data type of the + returned array is determined by the Type Promotion Rules. +""" + +acos = UnaryElementwiseFunc( + "acos", ti._acos_result_type, ti._acos, _acos_docstring +) +del _acos_docstring + +# U03: ===== ACOSH (x) +_acosh_docstring = r""" +acosh(x, /, \*, out=None, order='K') + +Computes inverse hyperbolic cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse hyperbolic cosine, in + radians and in the half-closed interval :math:`[0, \infty)`. The data + type of the returned array is determined by the Type Promotion Rules. 
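A quick illustration of the `abs` semantics documented above (a sketch; assumes the `dpnp.tensor` namespace exposes `asarray` as used elsewhere in this patch):

```python
import dpnp.tensor as dpt

x = dpt.asarray([3 + 4j], dtype="c8")   # complex64 input
y = dpt.abs(x)                          # magnitude: sqrt(3**2 + 4**2)
# y is real-valued float32 with y[0] == 5.0, matching the precision of x
```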
+""" + +acosh = UnaryElementwiseFunc( + "acosh", ti._acosh_result_type, ti._acosh, _acosh_docstring +) +del _acosh_docstring + +# B01: ===== ADD (x1, x2) + +_add_docstring_ = r""" +add(x1, x2, /, \*, out=None, order='K') + +Calculates the sum for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise sums. The data type of the + returned array is determined by the Type Promotion Rules. +""" +add = BinaryElementwiseFunc( + "add", + ti._add_result_type, + ti._add, + _add_docstring_, + binary_inplace_fn=ti._add_inplace, +) +del _add_docstring_ + +# U04: ===== ASIN (x) +_asin_docstring = r""" +asin(x, /, \*, out=None, order='K') + +Computes inverse sine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse sine, in radians + and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +asin = UnaryElementwiseFunc( + "asin", ti._asin_result_type, ti._asin, _asin_docstring +) +del _asin_docstring + +# U05: ===== ASINH (x) +_asinh_docstring = r""" +asinh(x, /, \*, out=None, order='K') + +Computes inverse hyperbolic sine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse hyperbolic sine, in + radians. The data type of the returned array is determined by + the Type Promotion Rules. +""" + +asinh = UnaryElementwiseFunc( + "asinh", ti._asinh_result_type, ti._asinh, _asinh_docstring +) +del _asinh_docstring + +# U06: ===== ATAN (x) +_atan_docstring = r""" +atan(x, /, \*, out=None, order='K') + +Computes inverse tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise inverse tangent, in radians + and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" + +atan = UnaryElementwiseFunc( + "atan", ti._atan_result_type, ti._atan, _atan_docstring +) +del _atan_docstring + +# B02: ===== ATAN2 (x1, x2) +_atan2_docstring_ = r""" +atan2(x1, x2, /, \*, out=None, order='K') + +Calculates the inverse tangent of the quotient `x1_i/x2_i` for each element +`x1_i` of the input array `x1` with the respective element `x2_i` of the +input array `x2`. Each element-wise result is expressed in radians. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point + data type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued + floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the inverse tangent of the quotient `x1`/`x2`. + The returned array must have a real-valued floating-point data type + determined by Type Promotion Rules. +""" + +atan2 = BinaryElementwiseFunc( + "atan2", ti._atan2_result_type, ti._atan2, _atan2_docstring_ +) +del _atan2_docstring_ + +# U07: ===== ATANH (x) +_atanh_docstring = r""" +atanh(x, /, \*, out=None, order='K') + +Computes hyperbolic inverse tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic inverse tangent, in + radians. The data type of the returned array is determined by + the Type Promotion Rules. +""" + +atanh = UnaryElementwiseFunc( + "atanh", ti._atanh_result_type, ti._atanh, _atanh_docstring +) +del _atanh_docstring + +# B03: ===== BITWISE_AND (x1, x2) +_bitwise_and_docstring_ = r""" +bitwise_and(x1, x2, /, \*, out=None, order='K') + +Computes the bitwise AND of the underlying binary representation of each +element `x1_i` of the input array `x1` with the respective element `x2_i` +of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer or boolean data type. + x2 (usm_ndarray): + Second input array, also expected to have integer or boolean data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_and = BinaryElementwiseFunc( + "bitwise_and", + ti._bitwise_and_result_type, + ti._bitwise_and, + _bitwise_and_docstring_, + binary_inplace_fn=ti._bitwise_and_inplace, +) +del _bitwise_and_docstring_ + +# B04: ===== BITWISE_LEFT_SHIFT (x1, x2) +_bitwise_left_shift_docstring_ = r""" +bitwise_left_shift(x1, x2, /, \*, out=None, order='K') + +Shifts the bits of each element `x1_i` of the input array x1 to the left by +appending `x2_i` (i.e., the respective element in the input array `x2`) zeros to +the right of `x1_i`. 
+ +Args: + x1 (usm_ndarray): + First input array, expected to have integer data type. + x2 (usm_ndarray): + Second input array, also expected to have integer data type. + Each element must be greater than or equal to 0. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_left_shift = BinaryElementwiseFunc( + "bitwise_left_shift", + ti._bitwise_left_shift_result_type, + ti._bitwise_left_shift, + _bitwise_left_shift_docstring_, + binary_inplace_fn=ti._bitwise_left_shift_inplace, +) +del _bitwise_left_shift_docstring_ + +# U08: ===== BITWISE_INVERT (x) +_bitwise_invert_docstring = r""" +bitwise_invert(x, /, \*, out=None, order='K') + +Inverts (flips) each bit for each element `x_i` of the input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have integer or boolean data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. + The data type of the returned array is same as the data type of the + input array. +""" + +bitwise_invert = UnaryElementwiseFunc( + "bitwise_invert", + ti._bitwise_invert_result_type, + ti._bitwise_invert, + _bitwise_invert_docstring, +) +del _bitwise_invert_docstring + +# B05: ===== BITWISE_OR (x1, x2) +_bitwise_or_docstring_ = r""" +bitwise_or(x1, x2, /, \*, out=None, order='K') + +Computes the bitwise OR of the underlying binary representation of each +element `x1_i` of the input array `x1` with the respective element `x2_i` +of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer or boolean data type. + x2 (usm_ndarray): + Second input array, also expected to have integer or boolean data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_or = BinaryElementwiseFunc( + "bitwise_or", + ti._bitwise_or_result_type, + ti._bitwise_or, + _bitwise_or_docstring_, + binary_inplace_fn=ti._bitwise_or_inplace, +) +del _bitwise_or_docstring_ + +# B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +_bitwise_right_shift_docstring_ = r""" +bitwise_right_shift(x1, x2, /, \*, out=None, order='K') + +Shifts the bits of each element `x1_i` of the input array `x1` to the right +according to the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer data type. + x2 (usm_ndarray): + Second input array, also expected to have integer data type. + Each element must be greater than or equal to 0. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. 
+ order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_right_shift = BinaryElementwiseFunc( + "bitwise_right_shift", + ti._bitwise_right_shift_result_type, + ti._bitwise_right_shift, + _bitwise_right_shift_docstring_, + binary_inplace_fn=ti._bitwise_right_shift_inplace, +) +del _bitwise_right_shift_docstring_ + + +# B07: ===== BITWISE_XOR (x1, x2) +_bitwise_xor_docstring_ = r""" +bitwise_xor(x1, x2, /, \*, out=None, order='K') + +Computes the bitwise XOR of the underlying binary representation of each +element `x1_i` of the input array `x1` with the respective element `x2_i` +of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have integer or boolean data type. + x2 (usm_ndarray): + Second input array, also expected to have integer or boolean data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +bitwise_xor = BinaryElementwiseFunc( + "bitwise_xor", + ti._bitwise_xor_result_type, + ti._bitwise_xor, + _bitwise_xor_docstring_, + binary_inplace_fn=ti._bitwise_xor_inplace, +) +del _bitwise_xor_docstring_ + +# U09: ==== CEIL (x) +_ceil_docstring = r""" +ceil(x, /, \*, out=None, order='K') + +Returns the ceiling for each element `x_i` for input array `x`. + +The ceil of `x_i` is the smallest integer `n`, such that `n >= x_i`. + +Args: + x (usm_ndarray): + Input array, expected to have a boolean or real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise ceiling. +""" + +ceil = UnaryElementwiseFunc( + "ceil", ti._ceil_result_type, ti._ceil, _ceil_docstring +) +del _ceil_docstring + +# U10: ==== CONJ (x) +_conj_docstring = r""" +conj(x, /, \*, out=None, order='K') + +Computes conjugate of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise conjugate values. +""" + +conj = UnaryElementwiseFunc( + "conj", ti._conj_result_type, ti._conj, _conj_docstring +) +del _conj_docstring + +# U11: ==== COS (x) +_cos_docstring = r""" +cos(x, /, \*, out=None, order='K') + +Computes cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. 
+ order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise cosine. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +cos = UnaryElementwiseFunc("cos", ti._cos_result_type, ti._cos, _cos_docstring) +del _cos_docstring + +# U12: ==== COSH (x) +_cosh_docstring = r""" +cosh(x, /, \*, out=None, order='K') + +Computes hyperbolic cosine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic cosine. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +cosh = UnaryElementwiseFunc( + "cosh", ti._cosh_result_type, ti._cosh, _cosh_docstring +) +del _cosh_docstring + +# B08: ==== DIVIDE (x1, x2) +_divide_docstring_ = r""" +divide(x1, x2, /, \*, out=None, order='K') + +Calculates the ratio for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a floating-point data type. + x2 (usm_ndarray): + Second input array, also expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise division. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +divide = BinaryElementwiseFunc( + "divide", + ti._divide_result_type, + ti._divide, + _divide_docstring_, + binary_inplace_fn=ti._divide_inplace, + acceptance_fn=_acceptance_fn_divide, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _divide_docstring_ + +# B09: ==== EQUAL (x1, x2) +_equal_docstring_ = r""" +equal(x1, x2, /, \*, out=None, order='K') + +Calculates equality test results for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise equality comparison. + The returned array has a data type of `bool`. +""" + +equal = BinaryElementwiseFunc( + "equal", + ti._equal_result_type, + ti._equal, + _equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _equal_docstring_ + +# U13: ==== EXP (x) +_exp_docstring = r""" +exp(x, /, \*, out=None, order='K') + +Computes the exponential for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. 
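To illustrate the weak-type handling that `_resolve_weak_types_all_py_ints` provides for `divide` and `equal`, a small hedged sketch via the same `dpctl.tensor` stand-in; the exact floating dtype depends on the device's fp64 support.

```python
import dpctl.tensor as dpt

x = dpt.asarray([1, 2, 3], dtype="int32")

# divide always yields a floating-point result under the promotion rules
r = dpt.divide(x, 2)   # the Python int participates as a weak type
print(r.dtype)         # float32 or float64, device-dependent
print(dpt.asnumpy(dpt.equal(x, 2)))  # [False  True False], dtype bool
```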
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise exponential of `x`.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp = UnaryElementwiseFunc("exp", ti._exp_result_type, ti._exp, _exp_docstring)
+del _exp_docstring
+
+# B10: ==== FLOOR_DIVIDE (x1, x2)
+_floor_divide_docstring_ = r"""
+floor_divide(x1, x2, /, \*, out=None, order='K')
+
+Calculates the ratio for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`, rounded down to the
+greatest integer-valued number that is not greater than the division result.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise floor of division.
+        The data type of the returned array is determined by the Type
+        Promotion Rules.
+"""
+
+floor_divide = BinaryElementwiseFunc(
+    "floor_divide",
+    ti._floor_divide_result_type,
+    ti._floor_divide,
+    _floor_divide_docstring_,
+    binary_inplace_fn=ti._floor_divide_inplace,
+)
+del _floor_divide_docstring_
+
+# B11: ==== GREATER (x1, x2)
+_greater_docstring_ = r"""
+greater(x1, x2, /, \*, out=None, order='K')
+
+Computes the greater-than test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise greater-than comparison.
+        The returned array has a data type of `bool`.
+"""
+
+greater = BinaryElementwiseFunc(
+    "greater",
+    ti._greater_result_type,
+    ti._greater,
+    _greater_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _greater_docstring_
+
+# B12: ==== GREATER_EQUAL (x1, x2)
+_greater_equal_docstring_ = r"""
+greater_equal(x1, x2, /, \*, out=None, order='K')
+
+Computes the greater-than or equal-to test results for each element `x1_i` of
+the input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise greater-than or equal-to
+        comparison.
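The floor semantics documented for `floor_divide` differ from C-style truncation for negative operands; a brief illustration, assuming the same stand-in namespace as above.

```python
import dpctl.tensor as dpt

a = dpt.asarray([7, -7], dtype="int64")
b = dpt.asarray([2, 2], dtype="int64")

# floor_divide rounds toward negative infinity, unlike C-style truncation
print(dpt.asnumpy(dpt.floor_divide(a, b)))  # [ 3 -4]
print(dpt.asnumpy(dpt.greater(a, b)))       # [ True False]
```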
+ The returned array has a data type of `bool`. +""" + +greater_equal = BinaryElementwiseFunc( + "greater_equal", + ti._greater_equal_result_type, + ti._greater_equal, + _greater_equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _greater_equal_docstring_ + +# U14: ==== EXPM1 (x) +_expm1_docstring = r""" +expm1(x, /, \*, out=None, order='K') + +Computes the exponential minus 1 for each element `x_i` of input array `x`. + +This function calculates `exp(x) - 1.0` more accurately for small values of `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise `exp(x) - 1` results. + The data type of the returned array is determined by the Type + Promotion Rules. +""" + +expm1 = UnaryElementwiseFunc( + "expm1", ti._expm1_result_type, ti._expm1, _expm1_docstring +) +del _expm1_docstring + +# U15: ==== FLOOR (x) +_floor_docstring = r""" +floor(x, /, \*, out=None, order='K') + +Returns the floor for each element `x_i` for input array `x`. + +The floor of `x_i` is the largest integer `n`, such that `n <= x_i`. + +Args: + x (usm_ndarray): + Input array, expected to have a boolean or real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise floor. +""" + +floor = UnaryElementwiseFunc( + "floor", ti._floor_result_type, ti._floor, _floor_docstring +) +del _floor_docstring + +# U16: ==== IMAG (x) +_imag_docstring = r""" +imag(x, /, \*, out=None, order='K') + +Computes imaginary part of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise imaginary component of input. + If the input is a real-valued data type, the returned array has + the same data type. If the input is a complex floating-point + data type, the returned array has a floating-point data type + with the same floating-point precision as complex input. +""" + +imag = UnaryElementwiseFunc( + "imag", ti._imag_result_type, ti._imag, _imag_docstring +) +del _imag_docstring + +# U17: ==== ISFINITE (x) +_isfinite_docstring_ = r""" +isfinite(x, /, \*, out=None, order='K') + +Test if each element of input array is a finite number. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array which is True where `x` is not positive infinity, + negative infinity, or NaN, False otherwise. 
+ The data type of the returned array is `bool`. +""" + +isfinite = UnaryElementwiseFunc( + "isfinite", ti._isfinite_result_type, ti._isfinite, _isfinite_docstring_ +) +del _isfinite_docstring_ + +# U18: ==== ISINF (x) +_isinf_docstring_ = r""" +isinf(x, /, \*, out=None, order='K') + +Test if each element of input array is an infinity. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array which is True where `x` is positive or negative infinity, + False otherwise. The data type of the returned array is `bool`. +""" + +isinf = UnaryElementwiseFunc( + "isinf", ti._isinf_result_type, ti._isinf, _isinf_docstring_ +) +del _isinf_docstring_ + +# U19: ==== ISNAN (x) +_isnan_docstring_ = r""" +isnan(x, /, \*, out=None, order='K') + +Test if each element of an input array is a NaN. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array which is True where x is NaN, False otherwise. + The data type of the returned array is `bool`. +""" + +isnan = UnaryElementwiseFunc( + "isnan", ti._isnan_result_type, ti._isnan, _isnan_docstring_ +) +del _isnan_docstring_ + +# B13: ==== LESS (x1, x2) +_less_docstring_ = r""" +less(x1, x2, /, \*, out=None, order='K') + +Computes the less-than test results for each element `x1_i` of +the input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise less-than comparison. + The returned array has a data type of `bool`. +""" + +less = BinaryElementwiseFunc( + "less", + ti._less_result_type, + ti._less, + _less_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _less_docstring_ + + +# B14: ==== LESS_EQUAL (x1, x2) +_less_equal_docstring_ = r""" +less_equal(x1, x2, /, \*, out=None, order='K') + +Computes the less-than or equal-to test results for each element `x1_i` of +the input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise less-than or equal-to + comparison. The returned array has a data type of `bool`. 
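The three classification functions above compose naturally; a short sketch of their expected outputs on a default-device array.

```python
import dpctl.tensor as dpt

x = dpt.asarray([1.0, float("inf"), float("-inf"), float("nan")])

print(dpt.asnumpy(dpt.isfinite(x)))  # [ True False False False]
print(dpt.asnumpy(dpt.isinf(x)))     # [False  True  True False]
print(dpt.asnumpy(dpt.isnan(x)))     # [False False False  True]
```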
+""" + +less_equal = BinaryElementwiseFunc( + "less_equal", + ti._less_equal_result_type, + ti._less_equal, + _less_equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _less_equal_docstring_ + +# U20: ==== LOG (x) +_log_docstring = r""" +log(x, /, \*, out=None, order='K') + +Computes the natural logarithm for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise natural logarithm values. + The data type of the returned array is determined by the Type + Promotion Rules. +""" + +log = UnaryElementwiseFunc("log", ti._log_result_type, ti._log, _log_docstring) +del _log_docstring + +# U21: ==== LOG1P (x) +_log1p_docstring = r""" +log1p(x, /, \*, out=None, order='K') + +Computes the natural logarithm of (1 + `x`) for each element `x_i` of input +array `x`. + +This function calculates `log(1 + x)` more accurately for small values of `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise `log(1 + x)` results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +log1p = UnaryElementwiseFunc( + "log1p", ti._log1p_result_type, ti._log1p, _log1p_docstring +) +del _log1p_docstring + +# U22: ==== LOG2 (x) +_log2_docstring_ = r""" +log2(x, /, \*, out=None, order='K') + +Computes the base-2 logarithm for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise base-2 logarithm of `x`. + The data type of the returned array is determined by the + Type Promotion Rules. +""" + +log2 = UnaryElementwiseFunc( + "log2", ti._log2_result_type, ti._log2, _log2_docstring_ +) +del _log2_docstring_ + +# U23: ==== LOG10 (x) +_log10_docstring_ = r""" +log10(x, /, \*, out=None, order='K') + +Computes the base-10 logarithm for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: `"K"`. + +Returns: + usm_ndarray: + An array containing the element-wise base-10 logarithm of `x`. + The data type of the returned array is determined by the + Type Promotion Rules. 
+""" + +log10 = UnaryElementwiseFunc( + "log10", ti._log10_result_type, ti._log10, _log10_docstring_ +) +del _log10_docstring_ + +# B15: ==== LOGADDEXP (x1, x2) +_logaddexp_docstring_ = r""" +logaddexp(x1, x2, /, \*, out=None, order='K') + +Calculates the natural logarithm of the sum of exponentials for each element +`x1_i` of the input array `x1` with the respective element `x2_i` of the input +array `x2`. + +This function calculates `log(exp(x1) + exp(x2))` more accurately for small +values of `x`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point data + type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued floating-point + data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +logaddexp = BinaryElementwiseFunc( + "logaddexp", ti._logaddexp_result_type, ti._logaddexp, _logaddexp_docstring_ +) +del _logaddexp_docstring_ + +# B16: ==== LOGICAL_AND (x1, x2) +_logical_and_docstring_ = r""" +logical_and(x1, x2, /, \*, out=None, order='K') + +Computes the logical AND for each element `x1_i` of the input array `x1` with +the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical AND results. +""" +logical_and = BinaryElementwiseFunc( + "logical_and", + ti._logical_and_result_type, + ti._logical_and, + _logical_and_docstring_, +) +del _logical_and_docstring_ + +# B17: ==== LOGICAL_OR (x1, x2) +_logical_or_docstring_ = r""" +logical_or(x1, x2, /, \*, out=None, order='K') + +Computes the logical OR for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical OR results. +""" +logical_or = BinaryElementwiseFunc( + "logical_or", + ti._logical_or_result_type, + ti._logical_or, + _logical_or_docstring_, +) +del _logical_or_docstring_ + +# B18: ==== LOGICAL_XOR (x1, x2) +_logical_xor_docstring_ = r""" +logical_xor(x1, x2, /, \*, out=None, order='K') + +Computes the logical XOR for each element `x1_i` of the input array `x1` +with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. 
+ out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical XOR results. +""" +logical_xor = BinaryElementwiseFunc( + "logical_xor", + ti._logical_xor_result_type, + ti._logical_xor, + _logical_xor_docstring_, +) +del _logical_xor_docstring_ + +# U24: ==== LOGICAL_NOT (x) +_logical_not_docstring = r""" +logical_not(x, /, \*, out=None, order='K') + +Computes the logical NOT for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise logical NOT results. +""" + +logical_not = UnaryElementwiseFunc( + "logical_not", + ti._logical_not_result_type, + ti._logical_not, + _logical_not_docstring, +) +del _logical_not_docstring + +# B26: ==== MAXIMUM (x1, x2) +_maximum_docstring_ = r""" +maximum(x1, x2, /, \*, out=None, order='K') + +Compares two input arrays `x1` and `x2` and returns a new array containing the +element-wise maxima. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise maxima. The data type of + the returned array is determined by the Type Promotion Rules. +""" +maximum = BinaryElementwiseFunc( + "maximum", + ti._maximum_result_type, + ti._maximum, + _maximum_docstring_, +) +del _maximum_docstring_ + +# B27: ==== MINIMUM (x1, x2) +_minimum_docstring_ = r""" +minimum(x1, x2, /, \*, out=None, order='K') + +Compares two input arrays `x1` and `x2` and returns a new array containing the +element-wise minima. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise minima. The data type of + the returned array is determined by the Type Promotion Rules. +""" +minimum = BinaryElementwiseFunc( + "minimum", + ti._minimum_result_type, + ti._minimum, + _minimum_docstring_, +) +del _minimum_docstring_ + +# B19: ==== MULTIPLY (x1, x2) +_multiply_docstring_ = r""" +multiply(x1, x2, /, \*, out=None, order='K') + +Calculates the product for each element `x1_i` of the input array `x1` with the +respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. May have any data type. + x2 (usm_ndarray): + Second input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. 
+ Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise products. The data type of + the returned array is determined by the Type Promotion Rules. +""" +multiply = BinaryElementwiseFunc( + "multiply", + ti._multiply_result_type, + ti._multiply, + _multiply_docstring_, + binary_inplace_fn=ti._multiply_inplace, +) +del _multiply_docstring_ + +# U25: ==== NEGATIVE (x) +_negative_docstring_ = r""" +negative(x, /, \*, out=None, order='K') + +Computes the numerical negative for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the negative of `x`. +""" + +negative = UnaryElementwiseFunc( + "negative", + ti._negative_result_type, + ti._negative, + _negative_docstring_, + acceptance_fn=_acceptance_fn_negative, +) +del _negative_docstring_ + +# B28: ==== NEXTAFTER (x1, x2) +_nextafter_docstring_ = r""" +nextafter(x1, x2, /, \*, out=None, order='K') + +Calculates the next floating-point value after element `x1_i` of the input +array `x1` toward the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point data + type. + x2 (usm_ndarray): + Second input array, expected to have a real-valued floating-point data + type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise next representable values of `x1` + in the direction of `x2`. The data type of the returned array is + determined by the Type Promotion Rules. +""" +nextafter = BinaryElementwiseFunc( + "nextafter", + ti._nextafter_result_type, + ti._nextafter, + _nextafter_docstring_, +) +del _nextafter_docstring_ + +# B20: ==== NOT_EQUAL (x1, x2) +_not_equal_docstring_ = r""" +not_equal(x1, x2, /, \*, out=None, order='K') + +Calculates inequality test results for each element `x1_i` of the +input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array. + x2 (usm_ndarray): + Second input array. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise inequality comparison. + The returned array has a data type of `bool`. +""" + +not_equal = BinaryElementwiseFunc( + "not_equal", + ti._not_equal_result_type, + ti._not_equal, + _not_equal_docstring_, + weak_type_resolver=_resolve_weak_types_all_py_ints, +) +del _not_equal_docstring_ + +# U26: ==== POSITIVE (x) +_positive_docstring_ = r""" +positive(x, /, \*, out=None, order='K') + +Computes the numerical positive for each element `x_i` of input array `x`. 
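A small illustration of `nextafter`, whose step size reveals the dtype's spacing near 1.0; the values in the comments are the usual IEEE spacings, not guaranteed output.

```python
import dpctl.tensor as dpt

one = dpt.asarray([1.0])
two = dpt.asarray([2.0])

# smallest representable step above 1.0 for the array's dtype:
# ~2.22e-16 for float64, ~1.19e-07 for float32
nxt = dpt.nextafter(one, two)
print(dpt.asnumpy(nxt - one))
```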
+ +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the positive of `x`. +""" + +positive = UnaryElementwiseFunc( + "positive", ti._positive_result_type, ti._positive, _positive_docstring_ +) +del _positive_docstring_ + +# B21: ==== POW (x1, x2) +_pow_docstring_ = r""" +pow(x1, x2, /, \*, out=None, order='K') + +Calculates `x1_i` raised to `x2_i` for each element `x1_i` of the input array +`x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have a numeric data type. + out (usm_ndarray): + Output array to populate. Array must have the correct + shape and the expected data type. + order ("C","F","A","K", optional): memory layout of the new + output array, if parameter `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the bases in `x1` raised to the exponents in `x2` + element-wise. The data type of the returned array is determined by the + Type Promotion Rules. +""" +pow = BinaryElementwiseFunc( + "pow", + ti._pow_result_type, + ti._pow, + _pow_docstring_, + binary_inplace_fn=ti._pow_inplace, +) +del _pow_docstring_ + +# U27: ==== REAL (x) +_real_docstring = r""" +real(x, /, \*, out=None, order='K') + +Computes real part of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise real component of input. + If the input is a real-valued data type, the returned array has + the same data type. If the input is a complex floating-point + data type, the returned array has a floating-point data type + with the same floating-point precision as complex input. +""" + +real = UnaryElementwiseFunc( + "real", ti._real_result_type, ti._real, _real_docstring +) +del _real_docstring + +# B22: ==== REMAINDER (x1, x2) +_remainder_docstring_ = r""" +remainder(x1, x2, /, \*, out=None, order='K') + +Calculates the remainder of division for each element `x1_i` of the input array +`x1` with the respective element `x2_i` of the input array `x2`. + +This function is equivalent to the Python modulus operator. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued data type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise remainders. Each remainder has the + same sign as respective element `x2_i`. The data type of the returned + array is determined by the Type Promotion Rules. 
+""" +remainder = BinaryElementwiseFunc( + "remainder", + ti._remainder_result_type, + ti._remainder, + _remainder_docstring_, + binary_inplace_fn=ti._remainder_inplace, +) +del _remainder_docstring_ + +# U28: ==== ROUND (x) +_round_docstring = r""" +round(x, /, \*, out=None, order='K') + +Rounds each element `x_i` of the input array `x` to +the nearest integer-valued number. + +When two integers are equally close to `x_i`, the result is the nearest even +integer to `x_i`. + +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise rounded values. +""" + +round = UnaryElementwiseFunc( + "round", + ti._round_result_type, + ti._round, + _round_docstring, + acceptance_fn=_acceptance_fn_round, +) +del _round_docstring + +# U29: ==== SIGN (x) +_sign_docstring = r""" +sign(x, /, \*, out=None, order='K') + +Computes an indication of the sign of each element `x_i` of input array `x` +using the signum function. + +The signum function returns `-1` if `x_i` is less than `0`, +`0` if `x_i` is equal to `0`, and `1` if `x_i` is greater than `0`. + +Args: + x (usm_ndarray): + Input array, expected to have a numeric data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise result of the signum function. The + data type of the returned array is determined by the Type Promotion + Rules. +""" + +sign = UnaryElementwiseFunc( + "sign", ti._sign_result_type, ti._sign, _sign_docstring +) +del _sign_docstring + +# U30: ==== SIN (x) +_sin_docstring = r""" +sin(x, /, \*, out=None, order='K') + +Computes sine for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real-valued floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise sine. The data type of the + returned array is determined by the Type Promotion Rules. +""" + +sin = UnaryElementwiseFunc("sin", ti._sin_result_type, ti._sin, _sin_docstring) +del _sin_docstring + +# U31: ==== SINH (x) +_sinh_docstring = r""" +sinh(x, /, \*, out=None, order='K') + +Computes hyperbolic sine for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic sine. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" + +sinh = UnaryElementwiseFunc( + "sinh", ti._sinh_result_type, ti._sinh, _sinh_docstring +) +del _sinh_docstring + +# U32: ==== SQUARE (x) +_square_docstring_ = r""" +square(x, /, \*, out=None, order='K') + +Squares each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array. May have any data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise squares of `x`. The data type of + the returned array is determined by the Type Promotion Rules. +""" + +square = UnaryElementwiseFunc( + "square", ti._square_result_type, ti._square, _square_docstring_ +) +del _square_docstring_ + +# U33: ==== SQRT (x) +_sqrt_docstring_ = r""" +sqrt(x, /, \*, out=None, order='K') + +Computes the positive square-root for each element `x_i` of input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise positive square-roots of `x`. The + data type of the returned array is determined by the Type Promotion + Rules. +""" + +sqrt = UnaryElementwiseFunc( + "sqrt", ti._sqrt_result_type, ti._sqrt, _sqrt_docstring_ +) +del _sqrt_docstring_ + +# B23: ==== SUBTRACT (x1, x2) +_subtract_docstring_ = r""" +subtract(x1, x2, /, \*, out=None, order='K') + +Calculates the difference between each element `x1_i` of the input +array `x1` and the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a numeric data type. + x2 (usm_ndarray): + Second input array, also expected to have a numeric data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise differences. The data type + of the returned array is determined by the Type Promotion Rules. +""" +subtract = BinaryElementwiseFunc( + "subtract", + ti._subtract_result_type, + ti._subtract, + _subtract_docstring_, + binary_inplace_fn=ti._subtract_inplace, + acceptance_fn=_acceptance_fn_subtract, +) +del _subtract_docstring_ + +# U34: ==== TAN (x) +_tan_docstring = r""" +tan(x, /, \*, out=None, order='K') + +Computes tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise tangent. The data type + of the returned array is determined by the Type Promotion Rules. 
+""" + +tan = UnaryElementwiseFunc("tan", ti._tan_result_type, ti._tan, _tan_docstring) +del _tan_docstring + +# U35: ==== TANH (x) +_tanh_docstring = r""" +tanh(x, /, \*, out=None, order='K') + +Computes hyperbolic tangent for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hyperbolic tangent. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +tanh = UnaryElementwiseFunc( + "tanh", ti._tanh_result_type, ti._tanh, _tanh_docstring +) +del _tanh_docstring + +# U36: ==== TRUNC (x) +_trunc_docstring = r""" +trunc(x, /, \*, out=None, order='K') + +Returns the truncated value for each element `x_i` for input array `x`. + +The truncated value of the scalar `x` is the nearest integer i which is +closer to zero than `x` is. In short, the fractional part of the +signed number `x` is discarded. + +Args: + x (usm_ndarray): + Input array, expected to have a boolean or real-valued data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the result of element-wise division. The data type + of the returned array is determined by the Type Promotion Rules. +""" +trunc = UnaryElementwiseFunc( + "trunc", ti._trunc_result_type, ti._trunc, _trunc_docstring +) +del _trunc_docstring + +# B24: ==== HYPOT (x1, x2) +_hypot_docstring_ = r""" +hypot(x1, x2, /, \*, out=None, order='K') + +Computes the square root of the sum of squares for each element `x1_i` of the +input array `x1` with the respective element `x2_i` of the input array `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real-valued floating-point data + type. + x2 (usm_ndarray): + Second input array, also expected to have a real-valued floating-point + data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise hypotenuse. The data type + of the returned array is determined by the Type Promotion Rules. +""" + +hypot = BinaryElementwiseFunc( + "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_ +) +del _hypot_docstring_ + +# U37: ==== CBRT (x) +_cbrt_docstring_ = r""" +cbrt(x, /, \*, out=None, order='K') + +Computes the cube-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real-valued floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise cube-root. 
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+cbrt = UnaryElementwiseFunc(
+    "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_
+)
+del _cbrt_docstring_
+
+# U38: ==== EXP2 (x)
+_exp2_docstring_ = r"""
+exp2(x, /, \*, out=None, order='K')
+
+Computes the base-2 exponential for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise base-2 exponentials.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp2 = UnaryElementwiseFunc(
+    "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_
+)
+del _exp2_docstring_
+
+# B25: ==== COPYSIGN (x1, x2)
+_copysign_docstring_ = r"""
+copysign(x1, x2, /, \*, out=None, order='K')
+
+Composes a floating-point value with the magnitude of `x1_i` and the sign of
+`x2_i` for each element of input arrays `x1` and `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued floating-point
+        data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+copysign = BinaryElementwiseFunc(
+    "copysign",
+    ti._copysign_result_type,
+    ti._copysign,
+    _copysign_docstring_,
+)
+del _copysign_docstring_
+
+# U39: ==== RSQRT (x)
+_rsqrt_docstring_ = r"""
+rsqrt(x, /, \*, out=None, order='K')
+
+Computes the reciprocal square-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real-valued floating-point data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise reciprocal square-root.
+        The returned array has a floating-point data type determined by
+        the Type Promotion Rules.
+"""
+
+rsqrt = UnaryElementwiseFunc(
+    "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_
+)
+del _rsqrt_docstring_
+
+# U40: ==== PROJ (x)
+_proj_docstring = r"""
+proj(x, /, \*, out=None, order='K')
+
+Computes projection of each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a complex data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise projection.
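A short sketch of the sign-composition behavior of `copysign` (including negative zero) alongside `exp2` and `cbrt`, using the same stand-in namespace.

```python
import dpctl.tensor as dpt

m = dpt.asarray([1.5, 2.5, 3.5])
s = dpt.asarray([-1.0, 1.0, -0.0])

# magnitude from the first argument, sign from the second;
# note that the sign of -0.0 counts as negative
print(dpt.asnumpy(dpt.copysign(m, s)))                  # [-1.5  2.5 -3.5]
print(dpt.asnumpy(dpt.exp2(dpt.asarray([0.0, 10.0]))))  # [   1. 1024.]
print(dpt.asnumpy(dpt.cbrt(dpt.asarray([8.0, 27.0]))))  # [2. 3.]
```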
+""" + +proj = UnaryElementwiseFunc( + "proj", ti._proj_result_type, ti._proj, _proj_docstring +) +del _proj_docstring + +# U41: ==== SIGNBIT (x) +_signbit_docstring = r""" +signbit(x, /, \*, out=None, order='K') + +Computes an indication of whether the sign bit of each element `x_i` of +input array `x` is set. + +Args: + x (usm_ndarray): + Input array, expected to have a real-valued floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array must have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise signbit results. The returned array + must have a data type of `bool`. +""" + +signbit = UnaryElementwiseFunc( + "signbit", ti._signbit_result_type, ti._signbit, _signbit_docstring +) +del _signbit_docstring + +# U42: ==== RECIPROCAL (x) +_reciprocal_docstring = r""" +reciprocal(x, /, \*, out=None, order='K') + +Computes the reciprocal of each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise reciprocals. + The returned array has a floating-point data type determined + by the Type Promotion Rules. +""" + +reciprocal = UnaryElementwiseFunc( + "reciprocal", + ti._reciprocal_result_type, + ti._reciprocal, + _reciprocal_docstring, + acceptance_fn=_acceptance_fn_reciprocal, +) +del _reciprocal_docstring + +# U43: ==== ANGLE (x) +_angle_docstring = r""" +angle(x, /, \*, out=None, order='K') + +Computes the phase angle (also called the argument) of each element `x_i` for +input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a complex floating-point data type. + out (Union[usm_ndarray, None], optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the new output array, if parameter + `out` is ``None``. + Default: "K". + +Returns: + usm_ndarray: + An array containing the element-wise phase angles. + The returned array has a floating-point data type determined + by the Type Promotion Rules. +""" + +angle = UnaryElementwiseFunc( + "angle", + ti._angle_result_type, + ti._angle, + _angle_docstring, +) +del _angle_docstring + +del ti diff --git a/dpnp/tensor/_flags.pyx b/dpnp/tensor/_flags.pyx new file mode 100644 index 000000000000..322d52bd56c7 --- /dev/null +++ b/dpnp/tensor/_flags.pyx @@ -0,0 +1,175 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +from libcpp cimport bool as cpp_bool + +from ._usmarray cimport ( + USM_ARRAY_C_CONTIGUOUS, + USM_ARRAY_F_CONTIGUOUS, + USM_ARRAY_WRITABLE, + usm_ndarray, +) + + +cdef cpp_bool _check_bit(int flag, int mask): + return (flag & mask) == mask + + +cdef class Flags: + """ + Helper class to query the flags of a :class:`dpctl.tensor.usm_ndarray` + instance, which describe how the instance interfaces with its underlying + memory. + """ + cdef int flags_ + cdef usm_ndarray arr_ + + def __cinit__(self, usm_ndarray arr, int flags): + self.arr_ = arr + self.flags_ = flags + + @property + def flags(self): + """ + Integer representation of the memory layout flags of + :class:`dpctl.tensor.usm_ndarray` instance. + """ + return self.flags_ + + @property + def c_contiguous(self): + """ + True if the memory layout of the + :class:`dpctl.tensor.usm_ndarray` instance is C-contiguous. + """ + return _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS) + + @property + def f_contiguous(self): + """ + True if the memory layout of the + :class:`dpctl.tensor.usm_ndarray` instance is F-contiguous. + """ + return _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS) + + @property + def writable(self): + """ + True if :class:`dpctl.tensor.usm_ndarray` instance is writable. + """ + return _check_bit(self.flags_, USM_ARRAY_WRITABLE) + + @writable.setter + def writable(self, new_val): + if not isinstance(new_val, bool): + raise TypeError("Expecting a boolean value") + self.arr_._set_writable_flag(new_val) + + @property + def fc(self): + """ + True if the memory layout of the :class:`dpctl.tensor.usm_ndarray` + instance is C-contiguous and F-contiguous. + """ + return ( + _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS) + and _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS) + ) + + @property + def forc(self): + """ + True if the memory layout of the :class:`dpctl.tensor.usm_ndarray` + instance is C-contiguous or F-contiguous. + """ + return ( + _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS) + or _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS) + ) + + @property + def fnc(self): + """ + True if the memory layout of the :class:`dpctl.tensor.usm_ndarray` + instance is F-contiguous and not C-contiguous. 
+        """
+        return (
+            _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+            and not _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+        )
+
+    @property
+    def contiguous(self):
+        """
+        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+        instance is C-contiguous or F-contiguous.
+        Equivalent to `forc`.
+        """
+        return self.forc
+
+    def __getitem__(self, name):
+        if name in ["C_CONTIGUOUS", "C"]:
+            return self.c_contiguous
+        elif name in ["F_CONTIGUOUS", "F"]:
+            return self.f_contiguous
+        elif name in ["WRITABLE", "W"]:
+            return self.writable
+        elif name == "FC":
+            return self.fc
+        elif name == "FNC":
+            return self.fnc
+        elif name in ["FORC", "CONTIGUOUS"]:
+            return self.forc
+
+    def __setitem__(self, name, val):
+        if name in ["WRITABLE", "W"]:
+            self.writable = val
+        else:
+            raise ValueError(
+                "Only writable ('W' or 'WRITABLE') flag can be set"
+            )
+
+    def __repr__(self):
+        out = []
+        for name in "C_CONTIGUOUS", "F_CONTIGUOUS", "WRITABLE":
+            out.append("  {} : {}".format(name, self[name]))
+        return "\n".join(out)
+
+    def __eq__(self, other):
+        cdef Flags other_
+        if isinstance(other, self.__class__):
+            other_ = other
+            return self.flags_ == other_.flags_
+        elif isinstance(other, int):
+            return self.flags_ == other
+        else:
+            return False
diff --git a/dpnp/tensor/_indexing_functions.py b/dpnp/tensor/_indexing_functions.py
new file mode 100644
index 000000000000..9ea0a16bdd03
--- /dev/null
+++ b/dpnp/tensor/_indexing_functions.py
@@ -0,0 +1,633 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
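For reviewers, a sketch of how the `Flags` helper is meant to be used through an array's `flags` attribute; this assumes the vendored `usm_ndarray` exposes the same interface as `dpctl.tensor`'s, including the writable setter shown above.

```python
import dpctl.tensor as dpt

x = dpt.zeros((2, 3), dtype="float32")
print(x.flags.c_contiguous)     # True for a freshly allocated C-order array
print(x.flags["F_CONTIGUOUS"])  # False for a non-square 2D C-order array

v = x.T                         # transposed view
print(v.flags.f_contiguous)     # True

x.flags["W"] = False            # only the writable flag can be assigned
print(x.flags.writable)         # False
```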
+# *****************************************************************************
+
+import operator
+
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._copy_utils import (
+    _extract_impl,
+    _nonzero_impl,
+    _put_multi_index,
+    _take_multi_index,
+)
+from ._numpy_helper import normalize_axis_index
+
+
+def _get_indexing_mode(name):
+    modes = {"wrap": 0, "clip": 1}
+    try:
+        return modes[name]
+    except KeyError:
+        raise ValueError(
+            "`mode` must be `wrap` or `clip`. Got `{}`.".format(name)
+        )
+
+
+def _range(sh_i, i, nd, q, usm_t, dt):
+    ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
+    ind.shape = tuple(sh_i if i == j else 1 for j in range(nd))
+    return ind
+
+
+def extract(condition, arr):
+    """extract(condition, arr)
+
+    Returns the elements of an array that satisfy the condition.
+
+    If ``condition`` is boolean, ``dpctl.tensor.extract`` is
+    equivalent to ``arr[condition]``.
+
+    Note that ``dpctl.tensor.place`` does the opposite of
+    ``dpctl.tensor.extract``.
+
+    Args:
+        condition (usm_ndarray):
+            An array whose non-zero or ``True`` entries indicate the elements
+            of ``arr`` to extract.
+
+        arr (usm_ndarray):
+            Input array of the same size as ``condition``.
+
+    Returns:
+        usm_ndarray:
+            Rank 1 array of values from ``arr`` where ``condition`` is
+            ``True``.
+    """
+    if not isinstance(condition, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(condition)}"
+        )
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    exec_q = dpt.get_execution_queue(
+        (
+            condition.sycl_queue,
+            arr.sycl_queue,
+        )
+    )
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError
+    if condition.shape != arr.shape:
+        raise ValueError("Arrays are not of the same size")
+    return _extract_impl(arr, condition)
+
+
+def nonzero(arr):
+    """nonzero(arr)
+
+    Return the indices of non-zero elements.
+
+    Returns a tuple of usm_ndarrays, one for each dimension
+    of ``arr``, containing the indices of the non-zero elements
+    in that dimension. The values of ``arr`` are always tested in
+    row-major, C-style order.
+
+    Args:
+        arr (usm_ndarray):
+            Input array, which has non-zero array rank.
+
+    Returns:
+        Tuple[usm_ndarray, ...]:
+            Indices of non-zero array elements.
+    """
+    if not isinstance(arr, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}"
+        )
+    if arr.ndim == 0:
+        raise ValueError("Array of positive rank is expected")
+    return _nonzero_impl(arr)
+
+
+def place(arr, mask, vals):
+    """place(arr, mask, vals)
+
+    Change elements of an array based on conditional and input values.
+
+    If ``mask`` is boolean, ``dpctl.tensor.place`` is
+    equivalent to ``arr[mask] = vals``.
+
+    Args:
+        arr (usm_ndarray):
+            Array to put data into.
+        mask (usm_ndarray):
+            Boolean mask array. Must have the same size as ``arr``.
+        vals (usm_ndarray, sequence):
+            Values to put into ``arr``. Only the first N elements are
+            used, where N is the number of True values in ``mask``. If
+            ``vals`` is smaller than N, it will be repeated, and if
+            elements of ``arr`` are to be masked, this sequence must be
+            non-empty. Array ``vals`` must be one dimensional.
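The three functions defined so far cooperate as follows; a hedged sketch (note that this `place` implementation requires `vals` to be a one-dimensional `usm_ndarray`, and a single value is repeated to fill every masked position).

```python
import dpctl.tensor as dpt

x = dpt.asarray([[1, 0, 2], [0, 3, 0]], dtype="int32")
mask = dpt.not_equal(x, 0)

print(dpt.asnumpy(dpt.extract(mask, x)))     # [1 2 3], rank-1 result

rows, cols = dpt.nonzero(x)                  # one index array per dimension
print(dpt.asnumpy(rows), dpt.asnumpy(cols))  # [0 0 1] [0 2 1]

dpt.place(x, mask, dpt.asarray([-1], dtype="int32"))
print(dpt.asnumpy(x))                        # [[-1  0 -1], [ 0 -1  0]]
```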
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpt.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpt.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + _manager = SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + nz_count = ti.mask_positions( + mask, cumsum, sycl_queue=exec_q, depends=deps_ev + ) + if nz_count == 0: + return + if vals.size == 0: + raise ValueError("Cannot insert from an empty array!") + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, pl_ev = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + _manager.add_event_pair(hev, pl_ev) + + +def put(x, indices, vals, /, *, axis=None, mode="wrap"): + """put(x, indices, vals, axis=None, mode="wrap") + + Puts values into an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array the values will be put into. + indices (usm_ndarray): + One-dimensional array of indices. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the result shape + ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``. + axis (int, optional): + The axis along which the values will be placed. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work, e.g. + + :Example: + + .. 
+        .. code-block:: python
+
+            from dpctl import tensor as dpt
+
+            def put_vec_duplicates(vec, ind, vals):
+                "Put values into vec, handling possible duplicates in ind"
+                assert vec.ndim == 1 and ind.ndim == 1 and vals.ndim == 1
+
+                # find positions of last occurrences of each
+                # unique index
+                ind_flipped = dpt.flip(ind)
+                ind_uniq = dpt.unique_all(ind_flipped).indices
+                has_dups = len(ind) != len(ind_uniq)
+
+                if has_dups:
+                    ind_uniq = dpt.subtract(vec.size - 1, ind_uniq)
+                    ind = dpt.take(ind, ind_uniq)
+                    vals = dpt.take(vals, ind_uniq)
+
+                dpt.put(vec, ind, vals)
+
+            n = 512
+            ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1)))
+            x = dpt.zeros(ind.size, dtype="int32")
+            vals = dpt.arange(ind.size, dtype=x.dtype)
+
+            # Values corresponding to last positions of
+            # duplicate indices are written into the vector x
+            put_vec_duplicates(x, ind, vals)
+
+            parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype))
+            expected = dpt.concat(parts)
+            assert dpt.all(x == expected)
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
+        )
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            "`indices` expected `dpt.usm_ndarray`, got `{}`.".format(
+                type(indices)
+            )
+        )
+    if isinstance(vals, dpt.usm_ndarray):
+        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
+    else:
+        queues_ = [x.sycl_queue, indices.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type]
+    if indices.ndim != 1:
+        raise ValueError(
+            "`indices` expected a 1D array, got `{}`".format(indices.ndim)
+        )
+    if indices.dtype.kind not in "ui":
+        raise IndexError(
+            "`indices` expected integer data type, got `{}`".format(
+                indices.dtype
+            )
+        )
+    exec_q = dpt.get_execution_queue(queues_)
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError
+    vals_usm_type = dpt.get_coerced_usm_type(usm_types_)
+
+    mode = _get_indexing_mode(mode)
+
+    x_ndim = x.ndim
+    if axis is None:
+        if x_ndim > 1:
+            raise ValueError(
+                "`axis` cannot be `None` for array of dimension `{}`".format(
+                    x_ndim
+                )
+            )
+        axis = 0
+
+    if x_ndim > 0:
+        axis = normalize_axis_index(operator.index(axis), x_ndim)
+        x_sh = x.shape
+        if x_sh[axis] == 0 and indices.size != 0:
+            raise IndexError("cannot take non-empty indices from an empty axis")
+        val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :]
+    else:
+        if axis != 0:
+            raise ValueError("`axis` must be 0 for an array of dimension 0.")
+        val_shape = indices.shape
+
+    if not isinstance(vals, dpt.usm_ndarray):
+        vals = dpt.asarray(
+            vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q
+        )
+    # choose to throw here for consistency with `place`
+    if vals.size == 0:
+        raise ValueError(
+            "cannot put into non-empty indices along an empty axis"
+        )
+    if vals.dtype == x.dtype:
+        rhs = vals
+    else:
+        rhs = dpt.astype(vals, x.dtype)
+    rhs = dpt.broadcast_to(rhs, val_shape)
+
+    _manager = SequentialOrderManager[exec_q]
+    deps_ev = _manager.submitted_events
+    hev, put_ev = ti._put(
+        x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(hev, put_ev)
+
+
+def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"):
+    """
+    Puts elements into an array at the one-dimensional indices specified by
+    ``indices`` along a provided ``axis``.
+
+    Args:
+        x (usm_ndarray):
+            input array. Must be compatible with ``indices``, except for the
+            axis (dimension) specified by ``axis``.
+        indices (usm_ndarray):
+            array indices. Must have the same rank (i.e., number of
+            dimensions) as ``x``.
+        vals (usm_ndarray):
+            Array of values to be put into ``x``.
+            Must be broadcastable to the shape of ``indices``.
+        axis: int
+            axis along which to place values. If ``axis`` is negative, the
+            function determines the axis along which to place values by
+            counting from the last dimension. Default: ``-1``.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    .. note::
+
+        If input array ``indices`` contains duplicates, a race condition
+        occurs, and the value written into corresponding positions in ``x``
+        may vary from run to run. Preserving sequential semantics in handling
+        the duplicates to achieve deterministic behavior requires additional
+        work.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpnp.tensor.usm_ndarray, got {type(indices)}"
+        )
+    x_nd = x.ndim
+    if x_nd != indices.ndim:
+        raise ValueError(
+            "Number of dimensions in the first and the second "
+            "argument arrays must be equal"
+        )
+    pp = normalize_axis_index(operator.index(axis), x_nd)
+    if isinstance(vals, dpt.usm_ndarray):
+        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
+    else:
+        queues_ = [x.sycl_queue, indices.sycl_queue]
+        usm_types_ = [x.usm_type, indices.usm_type]
+    exec_q = dpt.get_execution_queue(queues_)
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments. "
+        )
+    out_usm_type = dpt.get_coerced_usm_type(usm_types_)
+    mode_i = _get_indexing_mode(mode)
+    indexes_dt = (
+        dpt.uint64
+        if indices.dtype == dpt.uint64
+        else ti.default_device_index_type(exec_q.sycl_device)
+    )
+    _ind = tuple(
+        (
+            indices
+            if i == pp
+            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
+        )
+        for i in range(x_nd)
+    )
+    return _put_multi_index(x, _ind, 0, vals, mode=mode_i)
+
+
+def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
+    """take(x, indices, axis=None, out=None, mode="wrap")
+
+    Takes elements from an array along a given axis at given indices.
+
+    Args:
+        x (usm_ndarray):
+            The array that elements will be taken from.
+        indices (usm_ndarray):
+            One-dimensional array of indices.
+        axis (int, optional):
+            The axis along which the values will be selected.
+            If ``x`` is one-dimensional, this argument is optional.
+            Default: ``None``.
+        out (Optional[usm_ndarray]):
+            Output array to populate. Array must have the correct
+            shape and the expected data type.
+        mode (str, optional):
+            How out-of-bounds indices will be handled. Possible values
+            are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    Returns:
+        usm_ndarray:
+            Array with shape
+            ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]``
+            filled with elements from ``x``.
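+
+    :Example:
+
+        A minimal sketch of gathering from a one-dimensional array
+        (illustrative values):
+
+        .. code-block:: python
+
+            import dpnp.tensor as dpt
+
+            x = dpt.arange(10, dtype="i4")
+            ind = dpt.arange(0, 10, step=3, dtype="i8")
+            y = dpt.take(x, ind)
+            # y contains [0, 3, 6, 9]; with the default mode="wrap",
+            # an index of -1 would select the last element of x
+            assert dpt.all(y == x[ind])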
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpt.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpt.ExecutionPlacementError + res_usm_type = dpt.get_coerced_usm_type([x.usm_type, indices.usm_type]) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape + + dt = x.dtype + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + if dt != out.dtype: + raise ValueError( + f"Output array of type {dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) + + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_ev) + out = orig_out + + return out + + +def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"): + """ + Returns elements from an array at the one-dimensional indices specified + by ``indices`` along a provided ``axis``. + + Args: + x (usm_ndarray): + input array. Must be compatible with ``indices``, except for the + axis (dimension) specified by ``axis``. + indices (usm_ndarray): + array indices. Must have the same rank (i.e., number of dimensions) + as ``x``. + axis: int + axis along which to select values. If ``axis`` is negative, the + function determines the axis along which to select values by + counting from the last dimension. Default: ``-1``. + mode (str, optional): + How out-of-bounds indices will be handled. 
+            Possible values are:
+
+            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
+              negative indices.
+            - ``"clip"``: clips indices to (``0 <= i < n``).
+
+            Default: ``"wrap"``.
+
+    Returns:
+        usm_ndarray:
+            an array having the same data type as ``x``. The returned array
+            has the same rank (i.e., number of dimensions) as ``x`` and a
+            shape determined according to broadcasting rules, except for the
+            axis (dimension) specified by ``axis`` whose size must equal the
+            size of the corresponding axis (dimension) in ``indices``.
+
+    Note:
+        Treatment of out-of-bounds indices in the ``indices`` array is
+        controlled by the value of the ``mode`` keyword.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}")
+    if not isinstance(indices, dpt.usm_ndarray):
+        raise TypeError(
+            f"Expected dpnp.tensor.usm_ndarray, got {type(indices)}"
+        )
+    x_nd = x.ndim
+    if x_nd != indices.ndim:
+        raise ValueError(
+            "Number of dimensions in the first and the second "
+            "argument arrays must be equal"
+        )
+    pp = normalize_axis_index(operator.index(axis), x_nd)
+    out_usm_type = dpt.get_coerced_usm_type((x.usm_type, indices.usm_type))
+    exec_q = dpt.get_execution_queue((x.sycl_queue, indices.sycl_queue))
+    if exec_q is None:
+        raise dpt.ExecutionPlacementError(
+            "Execution placement can not be unambiguously inferred "
+            "from input arguments. "
+        )
+    mode_i = _get_indexing_mode(mode)
+    indexes_dt = (
+        dpt.uint64
+        if indices.dtype == dpt.uint64
+        else ti.default_device_index_type(exec_q.sycl_device)
+    )
+    _ind = tuple(
+        (
+            indices
+            if i == pp
+            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
+        )
+        for i in range(x_nd)
+    )
+    return _take_multi_index(x, _ind, 0, mode=mode_i)
diff --git a/dpnp/tensor/_linear_algebra_functions.py b/dpnp/tensor/_linear_algebra_functions.py
new file mode 100644
index 000000000000..dcaf99b4423c
--- /dev/null
+++ b/dpnp/tensor/_linear_algebra_functions.py
@@ -0,0 +1,1015 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_elementwise_impl as tei
+import dpnp.tensor._tensor_impl as ti
+import dpnp.tensor._tensor_linalg_impl as tli
+
+from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
+from ._manipulation_functions import _broadcast_shape_impl
+from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
+from ._type_utils import (
+    _acceptance_fn_default_binary,
+    _find_buf_dtype2,
+    _to_device_supported_dtype,
+)
+
+
+def matrix_transpose(x):
+    r"""matrix_transpose(x)
+
+    Transposes the innermost two dimensions of `x`, where `x` is a
+    2-dimensional matrix or a stack of 2-dimensional matrices.
+
+    To convert from a 1-dimensional array to a 2-dimensional column
+    vector, use `x[:, dpt.newaxis]`.
+
+    Args:
+        x (usm_ndarray):
+            Input array with shape (..., m, n).
+
+    Returns:
+        usm_ndarray:
+            Array with shape (..., n, m).
+    """
+
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
+        )
+    if x.ndim < 2:
+        raise ValueError(
+            "dpnp.tensor.matrix_transpose requires array to have "
+            "at least 2 dimensions"
+        )
+
+    return x.mT
+
+
+def tensordot(x1, x2, axes=2):
+    r"""tensordot(x1, x2, axes=2)
+
+    Returns a tensor contraction of `x1` and `x2` over specific axes.
+
+    Args:
+        x1 (usm_ndarray):
+            first input array, expected to have numeric data type.
+        x2 (usm_ndarray):
+            second input array, expected to have numeric data type.
+            Corresponding contracted axes of `x1` and `x2` must be equal.
+        axes (Union[int, Tuple[Sequence[int], Sequence[int]]]):
+            number of axes to contract or explicit sequences of axes for
+            `x1` and `x2`, respectively. If `axes` is an integer equal to `N`,
+            then the contraction is performed over the last `N` axes of `x1`
+            and the first `N` axes of `x2`, in order. The size of each
+            corresponding axis must match, and `N` must be non-negative.
+
+            * if `N` equals `0`, the result is the tensor outer product
+            * if `N` equals `1`, the result is the tensor dot product
+            * if `N` equals `2`, the result is the tensor double
+              contraction (default).
+
+            If `axes` is a tuple of two sequences `(x1_axes, x2_axes)`, the
+            first sequence applies to `x1` and the second sequence applies
+            to `x2`. Both sequences must have equal length, and each axis
+            `x1_axes[i]` for `x1` must have the same size as the respective
+            axis `x2_axes[i]` for `x2`. Each sequence must consist of unique
+            integers that specify valid axes for each respective array.
+            For example, if `x1` has rank `N`, a valid axis must reside on the
+            half-open interval `[-N, N)`.
+    Returns:
+        usm_ndarray:
+            an array containing the tensor contraction whose shape consists of
+            the non-contracted axes of the first array `x1`, followed by the
+            non-contracted axes of the second array `x2`.
The returned array + must have a data type determined by Type Promotion Rules. + """ + if not isinstance(x1, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}") + q1, x1_usm_type = x1.sycl_queue, x1.usm_type + q2, x2_usm_type = x2.sycl_queue, x2.usm_type + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x1_usm_type, + x2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + # handle axes and shapes validation + x1_nd = x1.ndim + x2_nd = x2.ndim + x1_shape = x1.shape + x2_shape = x2.shape + if isinstance(axes, int): + if axes < 0: + raise ValueError("`axes` integer is expected to be non-negative") + n_axes1 = axes + n_axes2 = axes + axes1 = normalize_axis_tuple(tuple(range(-axes, 0)), x1_nd) + axes2 = tuple(range(0, axes)) + elif isinstance(axes, tuple): + if len(axes) != 2: + raise ValueError( + "`axes` tuple is expected to contain two sequences" + ) + axes1 = tuple(axes[0]) + axes2 = tuple(axes[1]) + n_axes1 = len(axes1) + n_axes2 = len(axes2) + else: + raise TypeError("`axes` must be an integer or a tuple of sequences") + if n_axes1 != n_axes2: + raise ValueError( + "number of axes contracted must be the same for each array" + ) + if n_axes1 == 0: + arr1 = x1[..., dpt.newaxis] + arr2 = x2[dpt.newaxis, ...] + n_axes1 = 1 + n_axes2 = 1 + else: + same_shapes = True + for i in range(n_axes1): + axis1 = axes1[i] + axis2 = axes2[i] + same_shapes = same_shapes and (x1_shape[axis1] == x2_shape[axis2]) + if not same_shapes: + raise ValueError("shape mismatch in contracted `tensordot` axes") + axes1 = normalize_axis_tuple(axes1, x1_nd) + axes2 = normalize_axis_tuple(axes2, x2_nd) + perm1 = [i for i in range(x1_nd) if i not in axes1] + list(axes1) + perm2 = list(axes2) + [i for i in range(x2_nd) if i not in axes2] + arr1 = dpt.permute_dims(x1, perm1) + arr2 = dpt.permute_dims(x2, perm2) + arr1_outer_nd = arr1.ndim - n_axes1 + arr2_outer_nd = arr2.ndim - n_axes2 + res_shape = arr1.shape[:arr1_outer_nd] + arr2.shape[n_axes2:] + # dtype validation + sycl_dev = exec_q.sycl_device + x1_dtype = x1.dtype + x2_dtype = x2.dtype + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + x1_dtype, + x2_dtype, + tli._dot_result_type, + sycl_dev, + acceptance_fn=_acceptance_fn_default_binary, + ) + if res_dt is None: + raise TypeError( + "function 'tensordot' does not support input types " + f"({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." 
+ ) + + _manager = SequentialOrderManager[exec_q] + if buf1_dt is None and buf2_dt is None: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=arr1, + x2=arr2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + + return out + + elif buf1_dt is None: + buf2 = _empty_like_orderK(arr2, buf2_dt) + + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_dot_ev, dot_ev = tli._dot( + x1=arr1, + x2=buf2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + + return out + + elif buf2_dt is None: + buf1 = _empty_like_orderK(arr1, buf1_dt) + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr1, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_dot_ev, dot_ev = tli._dot( + x1=buf1, + x2=arr2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + + return out + + buf1 = _empty_like_orderK(arr1, buf1_dt) + deps_ev = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + buf2 = _empty_like_orderK(arr2, buf2_dt) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=buf2, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_, dot_ev = tli._dot( + x1=buf1, + x2=buf2, + batch_dims=0, + x1_outer_dims=arr1_outer_nd, + x2_outer_dims=arr2_outer_nd, + inner_dims=n_axes1, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, dot_ev) + + return out + + +def vecdot(x1, x2, axis=-1): + r"""vecdot(x1, x2, axis=-1) + + Computes the (vector) dot product of two arrays. + + Args: + x1 (usm_ndarray): + first input array. + x2 (usm_ndarray): + second input array. Input arrays must have compatible + shapes along non-contract axes according to broadcasting + rules, and must have the same size along the contracted + axis. Input arrays should be of numeric type. + axis (Optional[int]): + axis over which to compute the dot product. The axis must + be an integer on the interval `[-N, -1]`, where `N` is + ``min(x1.ndim, x2.ndim)``. The axis along which dot product + is performed is counted backward from the last axes + (that is, `-1` refers to the last axis). By default, + dot product is computed over the last axis. + Default: `-1`. 
+ + Returns: + usm_ndarray: + if `x1` and `x2` are both one-dimensional arrays, a + zero-dimensional array containing the dot product value + is returned; otherwise, a non-zero-dimensional array containing + the dot products and having rank `N-1`, where `N` is the rank + of the shape of input arrays after broadcasting rules are applied + to non-contracted axes. + """ + if not isinstance(x1, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}") + q1, x1_usm_type = x1.sycl_queue, x1.usm_type + q2, x2_usm_type = x2.sycl_queue, x2.usm_type + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x1_usm_type, + x2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + # axis and shape validation + x1_nd = x1.ndim + x2_nd = x2.ndim + x1_shape = x1.shape + x2_shape = x2.shape + if axis >= 0: + raise ValueError("`axis` must be negative") + axis = operator.index(axis) + x1_axis = normalize_axis_index(axis, x1_nd) + x2_axis = normalize_axis_index(axis, x2_nd) + if x1_shape[x1_axis] != x2_shape[x2_axis]: + raise ValueError( + "given axis must have the same shape for `x1` and `x2`" + ) + if x1_nd > x2_nd: + x2_shape = (1,) * (x1_nd - x2_nd) + x2_shape + elif x2_nd > x1_nd: + x1_shape = (1,) * (x2_nd - x1_nd) + x1_shape + try: + broadcast_sh = _broadcast_shape_impl( + [ + x1_shape, + x2_shape, + ] + ) + except ValueError: + raise ValueError("mismatch in `vecdot` dimensions") + broadcast_nd = len(broadcast_sh) + contracted_axis = normalize_axis_index(axis, broadcast_nd) + res_sh = tuple( + [broadcast_sh[i] for i in range(broadcast_nd) if i != contracted_axis] + ) + # dtype validation + sycl_dev = exec_q.sycl_device + x1_dtype = x1.dtype + x2_dtype = x2.dtype + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + x1_dtype, + x2_dtype, + tli._dot_result_type, + sycl_dev, + acceptance_fn=_acceptance_fn_default_binary, + ) + if res_dt is None: + raise TypeError( + "function 'vecdot' does not support input types " + f"({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." 
+ ) + + _manager = SequentialOrderManager[exec_q] + if buf1_dt is None and buf2_dt is None: + if x1.dtype.kind == "c": + x1_tmp = _empty_like_orderK(x1, x1.dtype) + dep_evs = _manager.submitted_events + ht_conj_ev, conj_ev = tei._conj( + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_conj_ev, conj_ev) + x1 = x1_tmp + if x1.shape != broadcast_sh: + x1 = dpt.broadcast_to(x1, broadcast_sh) + if x2.shape != broadcast_sh: + x2 = dpt.broadcast_to(x2, broadcast_sh) + x1 = dpt.moveaxis(x1, contracted_axis, -1) + x2 = dpt.moveaxis(x2, contracted_axis, -1) + out = dpt.empty( + res_sh, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + dep_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=x1, + x2=x2, + batch_dims=len(res_sh), + x1_outer_dims=0, + x2_outer_dims=0, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + return dpt.reshape(out, res_sh) + + elif buf1_dt is None: + if x1.dtype.kind == "c": + x1_tmp = _empty_like_orderK(x1, x1.dtype) + deps_ev = _manager.submitted_events + ht_conj_ev, conj_e = tei._conj( + src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_conj_ev, conj_e) + x1 = x1_tmp + buf2 = _empty_like_orderK(x2, buf2_dt) + deps_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if x1.shape != broadcast_sh: + x1 = dpt.broadcast_to(x1, broadcast_sh) + if buf2.shape != broadcast_sh: + buf2 = dpt.broadcast_to(buf2, broadcast_sh) + x1 = dpt.moveaxis(x1, contracted_axis, -1) + buf2 = dpt.moveaxis(buf2, contracted_axis, -1) + out = dpt.empty( + res_sh, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + ht_dot_ev, dot_ev = tli._dot( + x1=x1, + x2=buf2, + batch_dims=len(res_sh), + x1_outer_dims=0, + x2_outer_dims=0, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + return dpt.reshape(out, res_sh) + + elif buf2_dt is None: + buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if buf1.dtype.kind == "c": + ht_conj_ev, conj_ev = tei._conj( + src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy_ev] + ) + _manager.add_event_pair(ht_conj_ev, conj_ev) + if buf1.shape != broadcast_sh: + buf1 = dpt.broadcast_to(buf1, broadcast_sh) + if x2.shape != broadcast_sh: + x2 = dpt.broadcast_to(x2, broadcast_sh) + buf1 = dpt.moveaxis(buf1, contracted_axis, -1) + x2 = dpt.moveaxis(x2, contracted_axis, -1) + out = dpt.empty( + res_sh, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order="C", + ) + deps_ev = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=buf1, + x2=x2, + batch_dims=len(res_sh), + x1_outer_dims=0, + x2_outer_dims=0, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=deps_ev, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + return dpt.reshape(out, res_sh) + + buf1 = _empty_like_orderK(x1, buf1_dt) + deps_ev = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if buf1.dtype.kind == "c": + 
+        ht_conj_ev, conj_ev = tei._conj(
+            src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy1_ev]
+        )
+        _manager.add_event_pair(ht_conj_ev, conj_ev)
+    buf2 = _empty_like_orderK(x2, buf2_dt)
+    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
+    )
+    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
+    if buf1.shape != broadcast_sh:
+        buf1 = dpt.broadcast_to(buf1, broadcast_sh)
+    if buf2.shape != broadcast_sh:
+        buf2 = dpt.broadcast_to(buf2, broadcast_sh)
+    buf1 = dpt.moveaxis(buf1, contracted_axis, -1)
+    buf2 = dpt.moveaxis(buf2, contracted_axis, -1)
+    out = dpt.empty(
+        res_sh,
+        dtype=res_dt,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+        order="C",
+    )
+    deps_ev = _manager.submitted_events
+    ht_dot_ev, dot_ev = tli._dot(
+        x1=buf1,
+        x2=buf2,
+        batch_dims=len(res_sh),
+        x1_outer_dims=0,
+        x2_outer_dims=0,
+        inner_dims=1,
+        dst=out,
+        sycl_queue=exec_q,
+        depends=deps_ev,
+    )
+    _manager.add_event_pair(ht_dot_ev, dot_ev)
+    return out
+
+
+def matmul(x1, x2, out=None, dtype=None, order="K"):
+    r"""matmul(x1, x2, out=None, dtype=None, order="K")
+
+    Computes the matrix product. Implements the same semantics
+    as the built-in operator `@`.
+
+    Args:
+        x1 (usm_ndarray):
+            first input array. Expected to have numeric data type, and
+            at least one dimension. If `x1` is one-dimensional having
+            shape `(M,)`, and `x2` has more than one dimension, `x1` is
+            effectively treated as a two-dimensional array with shape `(1, M)`,
+            although the prepended dimension is removed from the output array.
+            If `x1` has shape `(..., M, K)`, the innermost two dimensions form
+            matrices on which to perform matrix multiplication.
+        x2 (usm_ndarray):
+            second input array. Expected to have numeric data type, and
+            at least one dimension. If `x2` is one-dimensional having
+            shape `(N,)`, and `x1` has more than one dimension, `x2` is
+            effectively treated as a two-dimensional array with shape `(N, 1)`,
+            although the appended dimension is removed from the output array.
+            If `x2` has shape `(..., K, N)`, the innermost two dimensions form
+            matrices on which to perform matrix multiplication.
+        out (Optional[usm_ndarray]):
+            the array into which the result of the matrix product is written.
+            The data type of `out` must match the expected data type of the
+            result or (if provided) `dtype`.
+            If `None` then a new array is returned. Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the data type of the
+            returned array is determined by the Type Promotion Rules.
+            Default: `None`.
+        order (["K", "C", "F", "A"]):
+            memory layout of the output array, if `out` is `None`, otherwise
+            the `order` parameter value is not used. Default: `K`.
+    Returns:
+        usm_ndarray:
+            * if both `x1` and `x2` are one-dimensional arrays with shape
+              `(N,)`, returned array is a zero-dimensional array containing
+              inner product as its only element.
+            * if `x1` is two-dimensional array with shape `(M, K)` and `x2` is
+              a two-dimensional array with shape `(K, N)`, returned array is a
+              two-dimensional array with shape `(M, N)` and contains the
+              conventional matrix product.
+            * if `x1` is a one-dimensional array with shape `(K,)` and `x2` is
+              an array with shape `(..., K, N)`, returned array contains the
+              conventional matrix product and has shape `(..., N)`.
+            * if `x1` is an array with shape `(..., M, K)` and `x2` is a
+              one-dimensional array with shape `(K,)`, returned array has shape
+              `(..., M)` and contains the conventional matrix product.
+ * if `x1` is a two-dimensional array with shape `(M, K)` and `x2` + is an array with shape `(..., K, N)`, returned array contains + conventional matrix product for each stacked matrix and has shape + `(..., M, N)`. + * if `x1` has shape `(..., M, K)` and `x2` is a two-dimensional + array with shape `(K, N)`, returned array contains conventional + matrix product for each stacked matrix and has shape + `(..., M, N)`. + * if both `x1` and `x2` have more than two dimensions, returned + array contains conventional matrix product for each stacked + matrix and has shape determined by broadcasting rules for + `x1.shape[:-2]` and `x2.shape[:-2]`. + + The data type of the returned array is determined by the Type + Promotion Rules. If either `x1` or `x2` has a complex floating + point type, neither argument is complex conjugated or transposed. + """ + if not isinstance(x1, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}") + if order not in ["K", "C", "F", "A"]: + order = "K" + q1, x1_usm_type = x1.sycl_queue, x1.usm_type + q2, x2_usm_type = x2.sycl_queue, x2.usm_type + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x1_usm_type, + x2_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + + x1_nd = x1.ndim + x2_nd = x2.ndim + if x1_nd == 0 or x2_nd == 0: + raise ValueError("one or more operands to `matmul` is 0 dimensional") + x1_shape = x1.shape + x2_shape = x2.shape + appended_axes = [] + if x1_nd == 1: + x1 = x1[dpt.newaxis, :] + x1_shape = x1.shape + appended_axes.append(-2) + if x2_nd == 1: + x2 = x2[:, dpt.newaxis] + x2_shape = x2.shape + appended_axes.append(-1) + if x1_shape[-1] != x2_shape[-2]: + raise ValueError("mismatch in `matmul` inner dimension") + x1_outer_sh = x1_shape[:-2] + x2_outer_sh = x2_shape[:-2] + try: + res_outer_sh = _broadcast_shape_impl( + [ + x1_outer_sh, + x2_outer_sh, + ] + ) + except ValueError: + raise ValueError("mismatch in `matmul` batching dimensions") + x1_broadcast_shape = res_outer_sh + x1_shape[-2:] + x2_broadcast_shape = res_outer_sh + x2_shape[-2:] + res_shape = res_outer_sh + x1_shape[-2:-1] + x2_shape[-1:] + + sycl_dev = exec_q.sycl_device + x1_dtype = x1.dtype + x2_dtype = x2.dtype + if dtype is None: + buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( + x1_dtype, + x2_dtype, + tli._dot_result_type, + sycl_dev, + acceptance_fn=_acceptance_fn_default_binary, + ) + if res_dt is None: + raise ValueError( + "function 'matmul' does not support input types " + f"({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, sycl_dev) + buf1_dt, buf2_dt = None, None + if x1_dtype != res_dt: + if dpt.can_cast(x1_dtype, res_dt, casting="same_kind"): + buf1_dt = res_dt + else: + raise ValueError( + r"`matmul` input `x1` cannot be cast from " + f"{x1_dtype} to " + f"requested type {res_dt} according to the casting rule " + "''same_kind''." 
+ ) + if x2_dtype != res_dt: + if dpt.can_cast(x2_dtype, res_dt, casting="same_kind"): + buf2_dt = res_dt + else: + raise ValueError( + r"`matmul` input `x2` cannot be cast from " + f"{x2_dtype} to " + f"requested type {res_dt} according to the casting rule " + "''same_kind''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + final_res_shape = tuple( + res_shape[i] + for i in range(-len(res_shape), 0) + if i not in appended_axes + ) + if out.shape != final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + + if appended_axes: + out = dpt.expand_dims(out, axis=appended_axes) + orig_out = out + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x1, out) and buf1_dt is None: + out = dpt.empty_like(out) + + if ti._array_overlap(x2, out) and buf2_dt is None: + # should not reach if out is reallocated + # after being checked against x1 + out = dpt.empty_like(out) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x1, + x2, + ) + ) + else "C" + ) + + _manager = SequentialOrderManager[exec_q] + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x1, x2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x1.shape != x1_broadcast_shape: + x1 = dpt.broadcast_to(x1, x1_broadcast_shape) + if x2.shape != x2_broadcast_shape: + x2 = dpt.broadcast_to(x2, x2_broadcast_shape) + deps_evs = _manager.submitted_events + ht_dot_ev, dot_ev = tli._dot( + x1=x1, + x2=x2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=deps_evs, + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[dot_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + if appended_axes: + out = dpt.squeeze(out, tuple(appended_axes)) + return out + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(x2, buf2_dt) + else: + buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) + deps_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x1.shape != x1_broadcast_shape: + x1 = dpt.broadcast_to(x1, x1_broadcast_shape) + if buf2.shape != x2_broadcast_shape: + buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape) + ht_dot_ev, dot_ev = 
tli._dot( + x1=x1, + x2=buf2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[dot_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + if appended_axes: + out = dpt.squeeze(out, tuple(appended_axes)) + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(x1, buf1_dt) + else: + buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, x2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if buf1.shape != x1_broadcast_shape: + buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape) + if x2.shape != x2_broadcast_shape: + x2 = dpt.broadcast_to(x2, x2_broadcast_shape) + ht_dot_ev, dot_ev = tli._dot( + x1=buf1, + x2=x2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_dot_ev, dot_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[dot_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + if appended_axes: + out = dpt.squeeze(out, tuple(appended_axes)) + return out + + if order == "K": + if x1.flags.c_contiguous and x2.flags.c_contiguous: + order = "C" + elif x1.flags.f_contiguous and x2.flags.f_contiguous: + order = "F" + if order == "K": + buf1 = _empty_like_orderK(x1, buf1_dt) + else: + buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order) + deps_ev = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = _empty_like_orderK(x2, buf2_dt) + else: + buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if buf1.shape != x1_broadcast_shape: + buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape) + if buf2.shape != x2_broadcast_shape: + buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape) + ht_, dot_ev = tli._dot( + x1=buf1, + x2=buf2, + batch_dims=len(res_shape[:-2]), + x1_outer_dims=1, + x2_outer_dims=1, + inner_dims=1, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, dot_ev) + if appended_axes: + out = dpt.squeeze(out, 
tuple(appended_axes)) + return out diff --git a/dpnp/tensor/_manipulation_functions.py b/dpnp/tensor/_manipulation_functions.py new file mode 100644 index 000000000000..7347f62de115 --- /dev/null +++ b/dpnp/tensor/_manipulation_functions.py @@ -0,0 +1,1094 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+import itertools
+import operator
+
+import numpy as np
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+import dpnp.tensor._tensor_impl as ti
+
+from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
+from ._type_utils import _supported_dtype, _to_device_supported_dtype
+
+__doc__ = (
+    "Implementation module for array manipulation "
+    "functions in :module:`dpnp.tensor`"
+)
+
+
+def _arrays_validation(arrays, check_ndim=True):
+    n = len(arrays)
+    if n == 0:
+        raise TypeError("Missing 1 required positional argument: 'arrays'.")
+
+    if not isinstance(arrays, (list, tuple)):
+        raise TypeError(f"Expected tuple or list type, got {type(arrays)}.")
+
+    for X in arrays:
+        if not isinstance(X, dpt.usm_ndarray):
+            raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    exec_q = dpt.get_execution_queue([X.sycl_queue for X in arrays])
+    if exec_q is None:
+        raise ValueError("All the input arrays must have the same sycl queue.")
+
+    res_usm_type = dpt.get_coerced_usm_type([X.usm_type for X in arrays])
+    if res_usm_type is None:
+        raise ValueError("All the input arrays must have a valid usm_type.")
+
+    X0 = arrays[0]
+    _supported_dtype(Xi.dtype for Xi in arrays)
+
+    res_dtype = X0.dtype
+    dev = exec_q.sycl_device
+    for i in range(1, n):
+        res_dtype = np.promote_types(res_dtype, arrays[i].dtype)
+    res_dtype = _to_device_supported_dtype(res_dtype, dev)
+
+    if check_ndim:
+        for i in range(1, n):
+            if X0.ndim != arrays[i].ndim:
+                raise ValueError(
+                    "All the input arrays must have the same number of "
+                    f"dimensions, but the array at index 0 has {X0.ndim} "
+                    f"dimension(s) and the array at index {i} has "
+                    f"{arrays[i].ndim} dimension(s)."
+                )
+    return res_dtype, res_usm_type, exec_q
+
+
+def _broadcast_shapes(*args):
+    """
+    Broadcast the input shapes into a single shape;
+    returns the broadcasted shape as a tuple.
+    """
+    array_shapes = [array.shape for array in args]
+    return _broadcast_shape_impl(array_shapes)
+
+
+def _broadcast_shape_impl(shapes):
+    if len(set(shapes)) == 1:
+        return shapes[0]
+    mutable_shapes = False
+    nds = [len(s) for s in shapes]
+    biggest = max(nds)
+    sh_len = len(shapes)
+    for i in range(sh_len):
+        diff = biggest - nds[i]
+        if diff > 0:
+            ty = type(shapes[i])
+            shapes[i] = ty(
+                itertools.chain(itertools.repeat(1, diff), shapes[i])
+            )
+    common_shape = []
+    for axis in range(biggest):
+        lengths = [s[axis] for s in shapes]
+        unique = set(lengths + [1])
+        if len(unique) > 2:
+            raise ValueError(
+                "Shape mismatch: two or more arrays have "
+                f"incompatible dimensions on axis {axis}"
+            )
+        elif len(unique) == 2:
+            unique.remove(1)
+            new_length = unique.pop()
+            common_shape.append(new_length)
+            for i in range(sh_len):
+                if shapes[i][axis] == 1:
+                    if not mutable_shapes:
+                        shapes = [list(s) for s in shapes]
+                        mutable_shapes = True
+                    shapes[i][axis] = new_length
+        else:
+            common_shape.append(1)
+
+    return tuple(common_shape)
+
+
+def _broadcast_strides(X_shape, X_strides, res_ndim):
+    """
+    Broadcasts strides to match the given dimensions;
+    returns the strides as a tuple.
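+
+    For example, broadcasting shape ``(3,)`` with strides ``(1,)`` to
+    ``res_ndim=2`` yields ``(0, 1)``: prepended axes, and axes of
+    size 1, are given stride 0, so every broadcast position aliases
+    the same underlying element.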
+ """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _check_same_shapes(X0_shape, axis, n, arrays): + for i in range(1, n): + Xi_shape = arrays[i].shape + for j, X0j in enumerate(X0_shape): + if X0j != Xi_shape[j] and j != axis: + raise ValueError( + "All the input array dimensions for the concatenation " + f"axis must match exactly, but along dimension {j}, the " + f"array at index 0 has size {X0j} and the array " + f"at index {i} has size {Xi_shape[j]}." + ) + + +def _concat_axis_None(arrays): + """Implementation of concat(arrays, axis=None).""" + res_dtype, res_usm_type, exec_q = _arrays_validation( + arrays, check_ndim=False + ) + res_shape = 0 + for array in arrays: + res_shape += array.size + res = dpt.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + fill_start = 0 + _manager = SequentialOrderManager[exec_q] + deps = _manager.submitted_events + for array in arrays: + fill_end = fill_start + array.size + if array.flags.c_contiguous: + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=dpt.reshape(array, -1), + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + else: + src_ = array + # _copy_usm_ndarray_for_reshape requires src and dst to have + # the same data type + if not array.dtype == res_dtype: + src2_ = dpt.empty_like(src_, dtype=res_dtype) + ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src_, dst=src2_, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_copy_ev, cpy_ev) + hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape( + src=src2_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=[cpy_ev], + ) + _manager.add_event_pair(hev, reshape_copy_ev) + else: + hev, cpy_ev = ti._copy_usm_ndarray_for_reshape( + src=src_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + +def broadcast_arrays(*args): + """broadcast_arrays(*arrays) + + Broadcasts one or more :class:`dpctl.tensor.usm_ndarrays` against + one another. + + Args: + arrays (usm_ndarray): an arbitrary number of arrays to be + broadcasted. + + Returns: + List[usm_ndarray]: + A list of broadcasted arrays. Each array + must have the same shape. Each array must have the same `dtype`, + `device` and `usm_type` attributes as its corresponding input + array. + """ + if len(args) == 0: + raise ValueError("`broadcast_arrays` requires at least one argument") + for X in args: + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + shape = _broadcast_shapes(*args) + + if all(X.shape == shape for X in args): + return args + + return [broadcast_to(X, shape) for X in args] + + +def broadcast_to(X, /, shape): + """broadcast_to(x, shape) + + Broadcast an array to a new `shape`; returns the broadcasted + :class:`dpctl.tensor.usm_ndarray` as a view. + + Args: + x (usm_ndarray): input array + shape (Tuple[int,...]): array shape. The `shape` must be + compatible with `x` according to broadcasting rules. + + Returns: + usm_ndarray: + An array with the specified `shape`. + The output array is a view of the input array, and + hence has the same data type, USM allocation type and + device attributes. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + # Use numpy.broadcast_to to check the validity of the input + # parameter 'shape'. Raise ValueError if 'X' is not compatible + # with 'shape' according to NumPy's broadcasting rules. + new_array = np.broadcast_to( + np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape + ) + new_sts = _broadcast_strides(X.shape, X.strides, new_array.ndim) + return dpt.usm_ndarray( + shape=new_array.shape, + dtype=X.dtype, + buffer=X, + strides=new_sts, + offset=X._element_offset, + ) + + +def concat(arrays, /, *, axis=0): + """concat(arrays, axis) + + Joins a sequence of arrays along an existing axis. + + Args: + arrays (Union[List[usm_ndarray, Tuple[usm_ndarray,...]]]): + input arrays to join. The arrays must have the same shape, + except in the dimension specified by `axis`. + axis (Optional[int]): axis along which the arrays will be joined. + If `axis` is `None`, arrays must be flattened before + concatenation. If `axis` is negative, it is understood as + being counted from the last dimension. Default: `0`. + + Returns: + usm_ndarray: + An output array containing the concatenated + values. The output array data type is determined by Type + Promotion Rules of array API. + + All input arrays must have the same device attribute. The output array + is allocated on that same device, and data movement operations are + scheduled on a queue underlying the device. The USM allocation type + of the output array is determined by USM allocation type promotion + rules. + """ + if axis is None: + return _concat_axis_None(arrays) + + res_dtype, res_usm_type, exec_q = _arrays_validation(arrays) + n = len(arrays) + X0 = arrays[0] + + axis = normalize_axis_index(axis, X0.ndim) + X0_shape = X0.shape + _check_same_shapes(X0_shape, axis, n, arrays) + + res_shape_axis = 0 + for X in arrays: + res_shape_axis = res_shape_axis + X.shape[axis] + + res_shape = tuple( + X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim) + ) + + res = dpt.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + deps = _manager.submitted_events + fill_start = 0 + for i in range(n): + fill_end = fill_start + arrays[i].shape[axis] + c_shapes_copy = tuple( + np.s_[fill_start:fill_end] if j == axis else np.s_[:] + for j in range(X0.ndim) + ) + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], + dst=res[c_shapes_copy], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + +def expand_dims(X, /, *, axis=0): + """expand_dims(x, axis) + + Expands the shape of an array by inserting a new axis (dimension) + of size one at the position specified by axis. + + Args: + x (usm_ndarray): + input array + axis (Union[int, Tuple[int]]): + axis position in the expanded axes (zero-based). If `x` has rank + (i.e, number of dimensions) `N`, a valid `axis` must reside + in the closed-interval `[-N-1, N]`. If provided a negative + `axis`, the `axis` position at which to insert a singleton + dimension is computed as `N + axis + 1`. Hence, if + provided `-1`, the resolved axis position is `N` (i.e., + a singleton dimension must be appended to the input array `x`). + If provided `-N-1`, the resolved axis position is `0` (i.e., a + singleton dimension is prepended to the input array `x`). 
+
+    Returns:
+        usm_ndarray:
+            Returns a view, if possible, and a copy otherwise, with the
+            number of dimensions increased.
+            The expanded array has the same data type as the input array `x`.
+            The expanded array is located on the same device as the input
+            array, and has the same USM allocation type.
+
+    Raises:
+        IndexError: if `axis` value is invalid.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    if type(axis) not in (tuple, list):
+        axis = (axis,)
+
+    out_ndim = len(axis) + X.ndim
+    axis = normalize_axis_tuple(axis, out_ndim)
+
+    shape_it = iter(X.shape)
+    shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim))
+
+    return dpt.reshape(X, shape)
+
+
+def flip(X, /, *, axis=None):
+    """flip(x, axis)
+
+    Reverses the order of elements in an array `x` along the given `axis`.
+    The shape of the array is preserved, but the elements are reordered.
+
+    Args:
+        x (usm_ndarray): input array.
+        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along
+            which to flip.
+            If `axis` is `None`, all input array axes are flipped.
+            If `axis` is negative, the flipped axis is counted from the
+            last dimension. If provided more than one axis, only the specified
+            axes are flipped. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            A view of `x` with the entries of `axis` reversed.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    X_ndim = X.ndim
+    if axis is None:
+        indexer = (np.s_[::-1],) * X_ndim
+    else:
+        axis = normalize_axis_tuple(axis, X_ndim)
+        indexer = tuple(
+            np.s_[::-1] if i in axis else np.s_[:] for i in range(X.ndim)
+        )
+    return X[indexer]
+
+
+def moveaxis(X, source, destination, /):
+    """moveaxis(x, source, destination)
+
+    Moves axes of an array to new positions.
+
+    Args:
+        x (usm_ndarray): input array
+
+        source (int or a sequence of int):
+            Original positions of the axes to move.
+            These must be unique. If `x` has rank (i.e., number of
+            dimensions) `N`, a valid `axis` must be in the
+            half-open interval `[-N, N)`.
+
+        destination (int or a sequence of int):
+            Destination positions for each of the original axes.
+            These must also be unique. If `x` has rank
+            (i.e., number of dimensions) `N`, a valid `axis` must be
+            in the half-open interval `[-N, N)`.
+
+    Returns:
+        usm_ndarray:
+            Array with moved axes.
+            The returned array has the same data type as `x`,
+            is created on the same device as `x` and has the same
+            USM allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
+        ValueError: if `source` and `destination` do not have the same
+            number of elements.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+
+    source = normalize_axis_tuple(source, X.ndim, "source")
+    destination = normalize_axis_tuple(destination, X.ndim, "destination")
+
+    if len(source) != len(destination):
+        raise ValueError(
+            "`source` and `destination` arguments must have "
+            "the same number of elements"
+        )
+
+    ind = [n for n in range(X.ndim) if n not in source]
+
+    for src, dst in sorted(zip(destination, source)):
+        ind.insert(src, dst)
+
+    return dpt.permute_dims(X, tuple(ind))
+
+
+def permute_dims(X, /, axes):
+    """permute_dims(x, axes)
+
+    Permute the axes (dimensions) of an array; returns the permuted
+    array as a view.
+
+    Args:
+        x (usm_ndarray): input array.
+        axes (Tuple[int, ...]): tuple containing permutation of
+            `(0,1,...,N-1)` where `N` is the number of axes (dimensions)
+            of `x`.
+    Returns:
+        usm_ndarray:
+            An array with permuted axes.
+            The returned array has the same data type as `x`,
+            is created on the same device as `x` and has the same USM
+            allocation type as `x`.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    axes = normalize_axis_tuple(axes, X.ndim, "axes")
+    if not X.ndim == len(axes):
+        raise ValueError(
+            "The length of the passed axes does not match "
+            "the number of usm_ndarray dimensions."
+        )
+    newstrides = tuple(X.strides[i] for i in axes)
+    newshape = tuple(X.shape[i] for i in axes)
+    return dpt.usm_ndarray(
+        shape=newshape,
+        dtype=X.dtype,
+        buffer=X,
+        strides=newstrides,
+        offset=X._element_offset,
+    )
+
+
+def repeat(x, repeats, /, *, axis=None):
+    """repeat(x, repeats, axis=None)
+
+    Repeat elements of an array on a per-element basis.
+
+    Args:
+        x (usm_ndarray): input array
+
+        repeats (Union[int, Sequence[int, ...], usm_ndarray]):
+            The number of repetitions for each element.
+
+            `repeats` must be broadcast-compatible with `N` where `N` is
+            `prod(x.shape)` if `axis` is `None` and `x.shape[axis]`
+            otherwise.
+
+            If `repeats` is an array, it must have an integer data type.
+            Otherwise, `repeats` must be a Python integer or sequence of
+            Python integers (i.e., a tuple, list, or range).
+
+        axis (Optional[int]):
+            The axis along which to repeat values. If `axis` is `None`, the
+            function repeats elements of the flattened array. Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            output array with repeated elements.
+
+            If `axis` is `None`, the returned array is one-dimensional,
+            otherwise, it has the same shape as `x`, except for the axis along
+            which elements were repeated.
+
+            The returned array will have the same data type as `x`.
+            The returned array will be located on the same device as `x` and
+            have the same USM allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
+
+    x_ndim = x.ndim
+    x_shape = x.shape
+    if axis is not None:
+        axis = normalize_axis_index(operator.index(axis), x_ndim)
+        axis_size = x_shape[axis]
+    else:
+        axis_size = x.size
+
+    scalar = False
+    if isinstance(repeats, int):
+        if repeats < 0:
+            raise ValueError("`repeats` must be a non-negative integer")
+        usm_type = x.usm_type
+        exec_q = x.sycl_queue
+        scalar = True
+    elif isinstance(repeats, dpt.usm_ndarray):
+        if repeats.ndim > 1:
+            raise ValueError(
+                "`repeats` array must be 0- or 1-dimensional, got "
+                f"{repeats.ndim}"
+            )
+        exec_q = dpt.get_execution_queue((x.sycl_queue, repeats.sycl_queue))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement cannot be unambiguously inferred "
+                "from input arguments."
+            )
+        usm_type = dpt.get_coerced_usm_type(
+            (
+                x.usm_type,
+                repeats.usm_type,
+            )
+        )
+        dpt.validate_usm_type(usm_type, allow_none=False)
+        if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
+            raise TypeError(
+                f"'repeats' data type {repeats.dtype} cannot be cast to "
+                "'int64' according to the casting rule 'same_kind'."
+            )
+        if repeats.size == 1:
+            scalar = True
+            # bring the single element to the host
+            if repeats.ndim == 0:
+                repeats = int(repeats)
+            else:
+                # Get the single element explicitly
+                # since non-0D arrays can not be converted to scalars
+                repeats = int(repeats[0])
+            if repeats < 0:
+                raise ValueError("`repeats` elements must be non-negative")
+        else:
+            if repeats.size != axis_size:
+                raise ValueError(
+                    "'repeats' array must be broadcastable to the size of "
+                    "the repeated axis"
+                )
+            if not dpt.all(repeats >= 0):
+                raise ValueError("'repeats' elements must be non-negative")
+
+    elif isinstance(repeats, (tuple, list, range)):
+        usm_type = x.usm_type
+        exec_q = x.sycl_queue
+
+        len_reps = len(repeats)
+        if len_reps == 1:
+            repeats = repeats[0]
+            if repeats < 0:
+                raise ValueError("`repeats` elements must be non-negative")
+            scalar = True
+        else:
+            if len_reps != axis_size:
+                raise ValueError(
+                    "`repeats` sequence must have the same length as the "
+                    "repeated axis"
+                )
+            repeats = dpt.asarray(
+                repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
+            )
+            if not dpt.all(repeats >= 0):
+                raise ValueError("`repeats` elements must be non-negative")
+    else:
+        raise TypeError(
+            "Expected int, sequence, or `usm_ndarray` for second argument, "
+            f"got {type(repeats)}"
+        )
+
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    if scalar:
+        res_axis_size = repeats * axis_size
+        if axis is not None:
+            res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+        else:
+            res_shape = (res_axis_size,)
+        res = dpt.empty(
+            res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q
+        )
+        if res_axis_size > 0:
+            ht_rep_ev, rep_ev = ti._repeat_by_scalar(
+                src=x,
+                dst=res,
+                reps=repeats,
+                axis=axis,
+                sycl_queue=exec_q,
+                depends=dep_evs,
+            )
+            _manager.add_event_pair(ht_rep_ev, rep_ev)
+    else:
+        if repeats.dtype != dpt.int64:
+            rep_buf = dpt.empty(
+                repeats.shape,
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+                src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs
+            )
+            _manager.add_event_pair(ht_copy_ev, copy_ev)
+            cumsum = dpt.empty(
+                (axis_size,),
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            # _cumsum_1d synchronizes so `depends` ends here safely
+            res_axis_size = ti._cumsum_1d(
+                rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev]
+            )
+            if axis is not None:
+                res_shape = (
+                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+                )
+            else:
+                res_shape = (res_axis_size,)
+            res = dpt.empty(
+                res_shape,
+                dtype=x.dtype,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            if res_axis_size > 0:
+                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
+                    src=x,
+                    dst=res,
+                    reps=rep_buf,
+                    cumsum=cumsum,
+                    axis=axis,
+                    sycl_queue=exec_q,
+                )
+                _manager.add_event_pair(ht_rep_ev, rep_ev)
+        else:
+            cumsum = dpt.empty(
+                (axis_size,),
+                dtype=dpt.int64,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            res_axis_size = ti._cumsum_1d(
+                repeats, cumsum, sycl_queue=exec_q, depends=dep_evs
+            )
+            if axis is not None:
+                res_shape = (
+                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
+                )
+            else:
+                res_shape = (res_axis_size,)
+            res = dpt.empty(
+                res_shape,
+                dtype=x.dtype,
+                usm_type=usm_type,
+                sycl_queue=exec_q,
+            )
+            if res_axis_size > 0:
+                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
+                    src=x,
+                    dst=res,
+                    reps=repeats,
+                    cumsum=cumsum,
+                    axis=axis,
+                    sycl_queue=exec_q,
+                )
+                _manager.add_event_pair(ht_rep_ev, rep_ev)
+    return res
+
+
+def roll(x, /, shift, *, axis=None):
+    """
+    roll(x, shift, axis)
+
+    Rolls array elements along a specified axis.
+    Array elements that roll beyond the last position are re-introduced
+    at the first position. Array elements that roll beyond the first position
+    are re-introduced at the last position.
+
+    Args:
+        x (usm_ndarray): input array
+        shift (Union[int, Tuple[int,...]]): number of places by which the
+            elements are shifted. If `shift` is a tuple, then `axis` must be a
+            tuple of the same size, and each of the given axes must be shifted
+            by the corresponding element in `shift`. If `shift` is an `int`
+            and `axis` a tuple, then the same `shift` must be used for all
+            specified axes. If a `shift` is positive, then array elements are
+            shifted positively (toward larger indices) along the dimension of
+            `axis`.
+            If a `shift` is negative, then array elements are shifted
+            negatively (toward smaller indices) along the dimension of `axis`.
+        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along
+            which elements are shifted. If `axis` is `None`, the array is
+            flattened, shifted, and then restored to its original shape.
+            Default: `None`.
+
+    Returns:
+        usm_ndarray:
+            An array having the same `dtype`, `usm_type` and
+            `device` attributes as `x` and whose elements are shifted relative
+            to `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
+    exec_q = x.sycl_queue
+    _manager = SequentialOrderManager[exec_q]
+    if axis is None:
+        shift = operator.index(shift)
+        res = dpt.empty(
+            x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
+        )
+        sz = operator.index(x.size)
+        shift = (shift % sz) if sz > 0 else 0
+        dep_evs = _manager.submitted_events
+        hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d(
+            src=x,
+            dst=res,
+            shift=shift,
+            sycl_queue=exec_q,
+            depends=dep_evs,
+        )
+        _manager.add_event_pair(hev, roll_ev)
+        return res
+    axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True)
+    broadcasted = np.broadcast(shift, axis)
+    if broadcasted.ndim > 1:
+        raise ValueError("'shift' and 'axis' should be scalars or 1D sequences")
+    shifts = [
+        0,
+    ] * x.ndim
+    shape = x.shape
+    for sh, ax in broadcasted:
+        n_i = operator.index(shape[ax])
+        shifted = shifts[ax] + operator.index(sh)
+        shifts[ax] = (shifted % n_i) if n_i > 0 else 0
+    res = dpt.empty(
+        x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
+    )
+    dep_evs = _manager.submitted_events
+    ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd(
+        src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs
+    )
+    _manager.add_event_pair(ht_e, roll_ev)
+    return res
+
+
+def squeeze(X, /, axis=None):
+    """squeeze(x, axis)
+
+    Removes singleton dimensions (axes) from array `x`.
+
+    Args:
+        x (usm_ndarray): input array
+        axis (Union[int, Tuple[int,...]]): axis (or axes) to squeeze.
+
+    Returns:
+        usm_ndarray:
+            Output array is a view, if possible,
+            and a copy otherwise, but with all or a subset of the
+            dimensions of length 1 removed. Output has the same data
+            type as the input, is allocated on the same device as the
+            input and has the same USM allocation type as the input
+            array `x`.
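+            For example, squeezing an array of shape `(1, 3, 1)` with
+            `axis=0` yields shape `(3, 1)`, while `axis=None` removes
+            all singleton dimensions and yields shape `(3,)`.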
+
+    Raises:
+        ValueError: if the specified axis has a size greater than one.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
+    X_shape = X.shape
+    if axis is not None:
+        axis = normalize_axis_tuple(axis, X.ndim if X.ndim != 0 else X.ndim + 1)
+        new_shape = []
+        for i, x in enumerate(X_shape):
+            if i not in axis:
+                new_shape.append(x)
+            else:
+                if x != 1:
+                    raise ValueError(
+                        "Cannot select an axis to squeeze out "
+                        "which has size not equal to one."
+                    )
+        new_shape = tuple(new_shape)
+    else:
+        new_shape = tuple(axis for axis in X_shape if axis != 1)
+    if new_shape == X.shape:
+        return X
+    else:
+        return dpt.reshape(X, new_shape)
+
+
+def stack(arrays, /, *, axis=0):
+    """
+    stack(arrays, axis)
+
+    Joins a sequence of arrays along a new axis.
+
+    Args:
+        arrays (Union[List[usm_ndarray], Tuple[usm_ndarray,...]]):
+            input arrays to join. Each array must have the same shape.
+        axis (int): axis along which the arrays will be joined. Providing
+            an `axis` specifies the index of the new axis in the dimensions
+            of the output array. A valid axis must be in the interval
+            `[-N, N)`, where `N` is the rank (number of dimensions) of `x`.
+            Default: `0`.
+
+    Returns:
+        usm_ndarray:
+            An output array having rank `N+1`, where `N` is
+            the rank (number of dimensions) of `x`. If the input arrays have
+            different data types, array API Type Promotion Rules apply.
+
+    Raises:
+        ValueError: if not all input arrays have the same shape.
+        IndexError: if provided an `axis` outside of the required interval.
+    """
+    res_dtype, res_usm_type, exec_q = _arrays_validation(arrays)
+
+    n = len(arrays)
+    X0 = arrays[0]
+    res_ndim = X0.ndim + 1
+    axis = normalize_axis_index(axis, res_ndim)
+    X0_shape = X0.shape
+
+    for i in range(1, n):
+        if X0_shape != arrays[i].shape:
+            raise ValueError("All input arrays must have the same shape")
+
+    res_shape = tuple(
+        X0_shape[i - 1 * (i >= axis)] if i != axis else n
+        for i in range(res_ndim)
+    )
+
+    res = dpt.empty(
+        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
+    )
+
+    _manager = SequentialOrderManager[exec_q]
+    dep_evs = _manager.submitted_events
+    for i in range(n):
+        c_shapes_copy = tuple(
+            i if j == axis else np.s_[:] for j in range(res_ndim)
+        )
+        _dst = res[c_shapes_copy]
+        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs
+        )
+        _manager.add_event_pair(hev, cpy_ev)
+
+    return res
+
+
+def swapaxes(X, axis1, axis2):
+    """swapaxes(x, axis1, axis2)
+
+    Interchanges two axes of an array.
+
+    Args:
+        x (usm_ndarray): input array
+
+        axis1 (int): First axis.
+            If `x` has rank (i.e., number of dimensions) `N`,
+            a valid `axis` must be in the half-open interval `[-N, N)`.
+
+        axis2 (int): Second axis.
+            If `x` has rank (i.e., number of dimensions) `N`,
+            a valid `axis` must be in the half-open interval `[-N, N)`.
+
+    Returns:
+        usm_ndarray:
+            Array with swapped axes.
+            The returned array has the same data type as `x`,
+            is created on the same device as `x` and has the same USM
+            allocation type as `x`.
+
+    Raises:
+        AxisError: if `axis` value is invalid.
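+
+    Example:
+        A minimal sketch (assuming the usual array constructors are
+        available in this namespace)::
+
+            x = dpt.zeros((2, 3, 4))
+            y = dpt.swapaxes(x, 0, 2)  # view with shape (4, 3, 2)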
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis1 = normalize_axis_index(axis1, X.ndim, "axis1") + axis2 = normalize_axis_index(axis2, X.ndim, "axis2") + + ind = list(range(0, X.ndim)) + ind[axis1] = axis2 + ind[axis2] = axis1 + return dpt.permute_dims(X, tuple(ind)) + + +def unstack(X, /, *, axis=0): + """unstack(x, axis=0) + + Splits an array in a sequence of arrays along the given axis. + + Args: + x (usm_ndarray): input array + + axis (int, optional): axis along which `x` is unstacked. + If `x` has rank (i.e, number of dimensions) `N`, + a valid `axis` must reside in the half-open interval `[-N, N)`. + Default: `0`. + + Returns: + Tuple[usm_ndarray,...]: + Output sequence of arrays which are views into the input array. + + Raises: + AxisError: if the `axis` value is invalid. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis = normalize_axis_index(axis, X.ndim) + Y = dpt.moveaxis(X, axis, 0) + + return tuple(Y[i] for i in range(Y.shape[0])) + + +def tile(x, repetitions, /): + """tile(x, repetitions) + + Repeat an input array `x` along each axis a number of times given by + `repetitions`. + + For `N` = len(`repetitions`) and `M` = len(`x.shape`): + + * If `M < N`, `x` will have `N - M` new axes prepended to its shape + * If `M > N`, `repetitions` will have `M - N` ones prepended to it + + Args: + x (usm_ndarray): input array + + repetitions (Union[int, Tuple[int, ...]]): + The number of repetitions along each dimension of `x`. + + Returns: + usm_ndarray: + tiled output array. + + The returned array will have rank `max(M, N)`. If `S` is the + shape of `x` after prepending dimensions and `R` is + `repetitions` after prepending ones, then the shape of the + result will be `S[i] * R[i]` for each dimension `i`. + + The returned array will have the same data type as `x`. + The returned array will be located on the same device as `x` and + have the same USM allocation type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + if not isinstance(repetitions, tuple): + if isinstance(repetitions, int): + repetitions = (repetitions,) + else: + raise TypeError( + f"Expected tuple or integer type, got {type(repetitions)}." 
+ ) + + rep_dims = len(repetitions) + x_dims = x.ndim + if rep_dims < x_dims: + repetitions = (x_dims - rep_dims) * (1,) + repetitions + elif x_dims < rep_dims: + x = dpt.reshape(x, (rep_dims - x_dims) * (1,) + x.shape) + res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions)) + # case of empty input + if x.size == 0: + return dpt.empty( + res_shape, + dtype=x.dtype, + usm_type=x.usm_type, + sycl_queue=x.sycl_queue, + ) + in_sh = x.shape + if res_shape == in_sh: + return dpt.copy(x) + expanded_sh = [] + broadcast_sh = [] + out_sz = 1 + for i in range(len(res_shape)): + out_sz *= res_shape[i] + reps, sh = repetitions[i], in_sh[i] + if reps == 1: + # dimension will be unchanged + broadcast_sh.append(sh) + expanded_sh.append(sh) + elif sh == 1: + # dimension will be broadcast + broadcast_sh.append(reps) + expanded_sh.append(sh) + else: + broadcast_sh.extend([reps, sh]) + expanded_sh.extend([1, sh]) + exec_q = x.sycl_queue + xdt = x.dtype + xut = x.usm_type + res = dpt.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q) + # no need to copy data for empty output + if out_sz > 0: + x = dpt.broadcast_to( + # this reshape should never copy + dpt.reshape(x, expanded_sh), + broadcast_sh, + ) + # copy broadcast input into flat array + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + hev, cp_ev = ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cp_ev) + return dpt.reshape(res, res_shape) diff --git a/dpnp/tensor/_numpy_helper.py b/dpnp/tensor/_numpy_helper.py new file mode 100644 index 000000000000..4ad735823cb3 --- /dev/null +++ b/dpnp/tensor/_numpy_helper.py @@ -0,0 +1,45 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import numpy as np + +_npver = np.lib.NumpyVersion(np.__version__) + +if _npver < "1.25.0": # pragma: no cover + from numpy import AxisError +else: + from numpy.exceptions import AxisError + +if _npver >= "2.0.0": + from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple +else: # pragma: no cover + from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple + + +__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"] diff --git a/dpnp/tensor/_print.py b/dpnp/tensor/_print.py new file mode 100644 index 000000000000..e39bf9041485 --- /dev/null +++ b/dpnp/tensor/_print.py @@ -0,0 +1,501 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import contextlib +import itertools +import operator + +import dpctl +import numpy as np +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +__doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`." 
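+
+# A minimal usage sketch of the helpers defined below (assuming the
+# parent `dpnp.tensor` namespace re-exports them, as `dpctl.tensor`
+# does):
+#
+#     import dpnp.tensor as dpt
+#     x = dpt.asarray([1.0, 2.0, 3.0])
+#     with dpt.print_options(precision=2):
+#         print(dpt.usm_ndarray_str(x))  # at most 2 fractional digits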
+ +_print_options = { + "linewidth": 75, + "edgeitems": 3, + "threshold": 1000, + "precision": 8, + "floatmode": "maxprec", + "suppress": False, + "nanstr": "nan", + "infstr": "inf", + "sign": "-", +} + + +def _move_to_next_line(string, s, line_width, prefix): + """Move string to next line if it doesn't fit in the current line.""" + bottom_len = len(s) - (s.rfind("\n") + 1) + next_line = bottom_len + len(string) + 1 > line_width + string = ",\n" + " " * len(prefix) + string if next_line else ", " + string + + return string + + +def _options_dict( + linewidth=None, + edgeitems=None, + threshold=None, + precision=None, + floatmode=None, + suppress=None, + nanstr=None, + infstr=None, + sign=None, + numpy=False, +): + if numpy: + numpy_options = np.get_printoptions() + options = {k: numpy_options[k] for k in _print_options.keys()} + else: + options = _print_options.copy() + + if suppress: + options["suppress"] = True + + local = dict(locals().items()) + for int_arg in ["linewidth", "precision", "threshold", "edgeitems"]: + val = local[int_arg] + if val is not None: + options[int_arg] = operator.index(val) + + for str_arg in ["nanstr", "infstr"]: + val = local[str_arg] + if val is not None: + if not isinstance(val, str): + raise TypeError( + "`{}` ".format(str_arg) + "must be of `string` type." + ) + options[str_arg] = val + + signs = ["-", "+", " "] + if sign is not None: + if sign not in signs: + raise ValueError( + "`sign` must be one of" + + ", ".join("`{}`".format(s) for s in signs) + ) + options["sign"] = sign + + floatmodes = ["fixed", "unique", "maxprec", "maxprec_equal"] + if floatmode is not None: + if floatmode not in floatmodes: + raise ValueError( + "`floatmode` must be one of" + + ", ".join("`{}`".format(m) for m in floatmodes) + ) + options["floatmode"] = floatmode + + return options + + +def set_print_options( + linewidth=None, + edgeitems=None, + threshold=None, + precision=None, + floatmode=None, + suppress=None, + nanstr=None, + infstr=None, + sign=None, + numpy=False, +): + """ + set_print_options(linewidth=None, edgeitems=None, threshold=None, + precision=None, floatmode=None, suppress=None, + nanstr=None, infstr=None, sign=None, numpy=False) + + Set options for printing :class:`dpctl.tensor.usm_ndarray` class. + + Args: + linewidth (int, optional): + Number of characters printed per line. + Raises `TypeError` if linewidth is not an integer. + Default: `75`. + edgeitems (int, optional): + Number of elements at the beginning and end + when the printed array is abbreviated. + Raises `TypeError` if edgeitems is not an integer. + Default: `3`. + threshold (int, optional): + Number of elements that triggers array abbreviation. + Raises `TypeError` if threshold is not an integer. + Default: `1000`. + precision (int or None, optional): + Number of digits printed for floating point numbers. + Raises `TypeError` if precision is not an integer. + Default: `8`. + floatmode (str, optional): + Controls how floating point numbers are interpreted. + `"fixed:`: + Always prints exactly `precision` digits. + `"unique"`: + Ignores precision, prints the number of + digits necessary to uniquely specify each number. + `"maxprec"`: + Prints `precision` digits or fewer, + if fewer will uniquely represent a number. + `"maxprec_equal"`: + Prints an equal number of digits + for each number. This number is `precision` digits + or fewer, if fewer will uniquely represent each number. + Raises `ValueError` if floatmode is not one of + `fixed`, `unique`, `maxprec`, or `maxprec_equal`. 
+ Default: "maxprec_equal" + suppress (bool, optional): + If `True,` numbers equal to zero in the current precision + will print as zero. + Default: `False`. + nanstr (str, optional): + String used to represent nan. + Raises `TypeError` if nanstr is not a string. + Default: `"nan"`. + infstr (str, optional): + String used to represent infinity. + Raises `TypeError` if infstr is not a string. + Default: `"inf"`. + sign (str, optional): + Controls the sign of floating point numbers. + `"-"`: + Omit the sign of positive numbers. + `"+"`: + Always print the sign of positive numbers. + `" "`: + Always print a whitespace in place of the + sign of positive numbers. + Raises `ValueError` if sign is not one of + `"-"`, `"+"`, or `" "`. + Default: `"-"`. + numpy (bool, optional): If `True,` then before other specified print + options are set, a dictionary of Numpy's print options + will be used to initialize dpctl's print options. + Default: "False" + """ + options = _options_dict( + linewidth=linewidth, + edgeitems=edgeitems, + threshold=threshold, + precision=precision, + floatmode=floatmode, + suppress=suppress, + nanstr=nanstr, + infstr=infstr, + sign=sign, + numpy=numpy, + ) + _print_options.update(options) + + +def get_print_options(): + """get_print_options() + + Returns a copy of current options for printing + :class:`dpctl.tensor.usm_ndarray` class. + + Returns: + dict: dictionary with array + printing option settings. + + Options: + - "linewidth" : int, default 75 + - "edgeitems" : int, default 3 + - "threshold" : int, default 1000 + - "precision" : int, default 8 + - "floatmode" : str, default "maxprec_equal" + - "suppress" : bool, default False + - "nanstr" : str, default "nan" + - "infstr" : str, default "inf" + - "sign" : str, default "-" + """ + return _print_options.copy() + + +@contextlib.contextmanager +def print_options(*args, **kwargs): + """ + Context manager for print options. + + Set print options for the scope of a `with` block. + `as` yields dictionary of print options. 
+ """ + options = dpt.get_print_options() + try: + dpt.set_print_options(*args, **kwargs) + yield dpt.get_print_options() + finally: + dpt.set_print_options(**options) + + +def _nd_corners(arr_in, edge_items): + _shape = arr_in.shape + max_shape = 2 * edge_items + 1 + if max(_shape) <= max_shape: + return dpt.asnumpy(arr_in) + res_shape = tuple( + max_shape if _shape[i] > max_shape else _shape[i] + for i in range(arr_in.ndim) + ) + + exec_q = arr_in.sycl_queue + arr_out = dpt.empty( + res_shape, + dtype=arr_in.dtype, + usm_type=arr_in.usm_type, + sycl_queue=exec_q, + ) + + blocks = [] + for i in range(len(_shape)): + if _shape[i] > max_shape: + blocks.append( + ( + np.s_[:edge_items], + np.s_[-edge_items:], + ) + ) + else: + blocks.append((np.s_[:],)) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + hev_list = [] + for slc in itertools.product(*blocks): + hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr_in[slc], + dst=arr_out[slc], + sycl_queue=exec_q, + depends=dep_evs, + ) + hev_list.append(hev) + + dpctl.SyclEvent.wait_for(hev_list) + return dpt.asnumpy(arr_out) + + +def usm_ndarray_str( + x, + line_width=None, + edge_items=None, + threshold=None, + precision=None, + floatmode=None, + suppress=None, + sign=None, + numpy=False, + separator=" ", + prefix="", + suffix="", +): + """ + usm_ndarray_str(x, line_width=None, edgeitems=None, threshold=None, + precision=None, floatmode=None, suppress=None, + sign=None, numpy=False, separator=" ", prefix="", + suffix="") + + Returns a string representing the elements of a + :class:`dpctl.tensor.usm_ndarray`. + + Args: + x (usm_ndarray): + Input array. + line_width (int, optional): + Number of characters printed per line. + Raises `TypeError` if line_width is not an integer. + Default: `75`. + edgeitems (int, optional): + Number of elements at the beginning and end + when the printed array is abbreviated. + Raises `TypeError` if edgeitems is not an integer. + Default: `3`. + threshold (int, optional): + Number of elements that triggers array abbreviation. + Raises `TypeError` if threshold is not an integer. + Default: `1000`. + precision (int or None, optional): + Number of digits printed for floating point numbers. + Raises `TypeError` if precision is not an integer. + Default: `8`. + floatmode (str, optional): + Controls how floating point numbers are interpreted. + `"fixed:`: + Always prints exactly `precision` digits. + `"unique"`: + Ignores precision, prints the number of + digits necessary to uniquely specify each number. + `"maxprec"`: + Prints `precision` digits or fewer, + if fewer will uniquely represent a number. + `"maxprec_equal"`: + Prints an equal number of digits for each number. + This number is `precision` digits or fewer, + if fewer will uniquely represent each number. + Raises `ValueError` if floatmode is not one of + `fixed`, `unique`, `maxprec`, or `maxprec_equal`. + Default: "maxprec_equal" + suppress (bool, optional): + If `True,` numbers equal to zero in the current precision + will print as zero. + Default: `False`. + sign (str, optional): + Controls the sign of floating point numbers. + `"-"`: + Omit the sign of positive numbers. + `"+"`: + Always print the sign of positive numbers. + `" "`: + Always print a whitespace in place of the + sign of positive numbers. + Raises `ValueError` if sign is not one of + `"-"`, `"+"`, or `" "`. + Default: `"-"`. 
+ numpy (bool, optional): + If `True,` then before other specified print + options are set, a dictionary of Numpy's print options + will be used to initialize dpctl's print options. + Default: "False" + separator (str, optional): + String inserted between elements of the array string. + Default: " " + prefix (str, optional): + String used to determine spacing to the left of the array string. + Default: "" + suffix (str, optional): + String that determines length of the last line of the array string. + Default: "" + + Returns: + str: string representation of input array. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + options = get_print_options() + options.update( + _options_dict( + linewidth=line_width, + edgeitems=edge_items, + threshold=threshold, + precision=precision, + floatmode=floatmode, + suppress=suppress, + sign=sign, + numpy=numpy, + ) + ) + + threshold = options["threshold"] + edge_items = options["edgeitems"] + + if x.size > threshold: + data = _nd_corners(x, edge_items) + options["threshold"] = 0 + else: + data = dpt.asnumpy(x) + with np.printoptions(**options): + s = np.array2string( + data, separator=separator, prefix=prefix, suffix=suffix + ) + return s + + +def usm_ndarray_repr( + x, line_width=None, precision=None, suppress=None, prefix="usm_ndarray" +): + """ + usm_ndarray_repr(x, line_width=None, precision=None, + suppress=None, prefix="") + + Returns a formatted string representing the elements + of a :class:`dpctl.tensor.usm_ndarray` and its data type, + if not a default type. + + Args: + x (usm_ndarray): Input array. + line_width (int, optional): Number of characters printed per line. + Raises `TypeError` if line_width is not an integer. + Default: `75`. + precision (int or None, optional): Number of digits printed for + floating point numbers. + Raises `TypeError` if precision is not an integer. + Default: `8`. + suppress (bool, optional): If `True,` numbers equal to zero + in the current precision will print as zero. + Default: `False`. + prefix (str, optional): String inserted at the start of the array + string. + Default: "" + + Returns: + str: formatted string representing the input array + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + if line_width is None: + line_width = _print_options["linewidth"] + + show_dtype = x.dtype not in [ + dpt.bool, + dpt.int64, + dpt.float64, + dpt.complex128, + ] + + prefix = prefix + "(" + suffix = ")" + + s = usm_ndarray_str( + x, + line_width=line_width, + precision=precision, + suppress=suppress, + separator=", ", + prefix=prefix, + suffix=suffix, + ) + + if show_dtype or x.size == 0: + dtype_str = f"dtype={x.dtype.name}" + dtype_str = _move_to_next_line(dtype_str, s, line_width, prefix) + else: + dtype_str = "" + + options = get_print_options() + threshold = options["threshold"] + if (x.size == 0 and x.shape != (0,)) or x.size > threshold: + shape_str = f"shape={x.shape}" + shape_str = _move_to_next_line(shape_str, s, line_width, prefix) + else: + shape_str = "" + + return prefix + s + shape_str + dtype_str + suffix diff --git a/dpnp/tensor/_reduction.py b/dpnp/tensor/_reduction.py new file mode 100644 index 000000000000..782fc2b0b442 --- /dev/null +++ b/dpnp/tensor/_reduction.py @@ -0,0 +1,830 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti +import dpnp.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple +from ._type_utils import ( + _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, + _to_device_supported_dtype, +) + + +def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + red_nd = len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = x.dtype + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) + return out + + +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + arr = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + arr = dpt.permute_dims(x, perm) + red_nd = len(axis) + res_shape = arr.shape[: nd - red_nd] + q = x.sycl_queue + inp_dt = x.dtype + if dtype is None: + res_dt = _default_reduction_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + res_usm_type = x.usm_type + + implemented_types = _dtype_supported(inp_dt, res_dt, res_usm_type, q) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined reduction data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=out, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + if implemented_types: + ht_e, red_e = _reduction_fn( + src=arr, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e, red_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[red_e] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt, res_usm_type, q): + tmp = dpt.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + ht_e_red, red_ev = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, red_ev) + else: + buf_dt = _default_reduction_type_fn(inp_dt, q) + tmp = dpt.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt.empty( + res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, r_e) + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[r_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) + return out + + +def _search_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if isinstance(axis, int): + axis = (axis,) + else: + raise TypeError( + f"'axis' argument expected to have type 'int' " + r"or be `None`, " + f"got type {type(axis)}" + ) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = 
len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and red_nd > 0: + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_fill, fill_ev = ti._full_usm_ndarray( + fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_fill, fill_ev) + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm) + return out + + +def argmax(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the maximum values of the input array ``x`` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + maximum value of the flattened array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. 
The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmax_over_axis) + + +def argmin(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the minimum values of the input array ``x`` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + minimum value of the flattened array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) + + +def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): + """ + Counts the number of elements in the input array ``x`` which are non-zero. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which to count. If a tuple of unique integers, + the number of non-zero values are computed over multiple axes. + If ``None``, the number of non-zero values is computed over the + entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and data + type. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the count of non-zero values. If the sum was + computed over the entire array, a zero-dimensional array is + returned. The returned array will have the default array index data + type. + """ + if x.dtype != dpt.bool: + x = dpt.astype(x, dpt.bool, copy=False) + return sum( + x, + axis=axis, + dtype=ti.default_device_index_type(x.sycl_device), + keepdims=keepdims, + out=out, + ) + + +def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the logarithm of the sum of exponentials of elements in the + input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If ``None``, the result is computed over the entire array. + Default: ``None``. 
+ dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real-valued floating-point data type, the + returned array will have the same data type as ``x``. + * If ``x`` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array ``x`` is allocated. + * If ``x`` has a complex-valued floating-point data type, + an error is raised. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the result. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. + The returned array has the data type as described in the + ``dtype`` parameter description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_accumulation_dtype_fp_types, + ) + + +def max(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the maximum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If ``None``, the max is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) + + +def min(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the minimum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If ``None``, the min is computed over the entire array. + Default: ``None``. 
+ keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) + + +def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the product of elements in the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If ``None``, the product is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + ``x``. + * If ``x`` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array ``x`` is allocated. + * If ``x`` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array ``x`` is allocated. + * If ``x`` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array ``x`` is allocated. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the product. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The + returned array has the data type as described in the ``dtype`` + parameter description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, + _default_accumulation_dtype, + ) + + +def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the square root of the sum of squares of elements in the input + array ``x``. + + Args: + x (usm_ndarray): + input array. 
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which values must be computed. If a tuple
+            of unique integers, values are computed over multiple axes.
+            If ``None``, the result is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real-valued floating-point data type, the
+              returned array will have the same data type as ``x``.
+            * If ``x`` has a boolean or integral data type, the returned array
+              will have the default floating point data type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a complex-valued floating-point data type,
+              an error is raised.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the result. Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the results. If the result was computed over
+            the entire array, a zero-dimensional array is returned. The
+            returned array has the data type as described in the ``dtype``
+            parameter description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._hypot_over_axis,
+        lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported(
+            inp_dt, res_dt
+        ),
+        _default_accumulation_dtype_fp_types,
+    )
+
+
+def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
+    """
+    Calculates the sum of elements in the input array ``x``.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which sums must be computed. If a tuple
+            of unique integers, sums are computed over multiple axes.
+            If ``None``, the sum is computed over the entire array.
+            Default: ``None``.
+        dtype (Optional[dtype]):
+            data type of the returned array. If ``None``, the default data
+            type is inferred from the "kind" of the input array data type.
+
+            * If ``x`` has a real- or complex-valued floating-point data
+              type, the returned array will have the same data type as
+              ``x``.
+            * If ``x`` has signed integral data type, the returned array
+              will have the default signed integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has unsigned integral data type, the returned array
+              will have the default unsigned integral type for the device
+              where input array ``x`` is allocated.
+            * If ``x`` has a boolean data type, the returned array will
+              have the default signed integral type for the device
+              where input array ``x`` is allocated.
+
+            If the data type (either specified or resolved) differs from the
+            data type of ``x``, the input array elements are cast to the
+            specified data type before computing the sum.
+            Default: ``None``.
+        keepdims (Optional[bool]):
+            if ``True``, the reduced axes (dimensions) are included in the
+            result as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if ``False``, the reduced axes are not included
+            in the returned array. Default: ``False``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of ``out`` must match the expected shape and the
+            expected data type of the result or (if provided) ``dtype``.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            an array containing the sums. If the sum was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the data type as described in the ``dtype`` parameter
+            description above.
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        out,
+        tri._sum_over_axis,
+        tri._sum_over_axis_dtype_supported,
+        _default_accumulation_dtype,
+    )
diff --git a/dpnp/tensor/_reshape.py b/dpnp/tensor/_reshape.py
new file mode 100644
index 000000000000..0187ae496003
--- /dev/null
+++ b/dpnp/tensor/_reshape.py
@@ -0,0 +1,208 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import operator
+
+import numpy as np
+from dpctl.utils import SequentialOrderManager
+
+import dpnp.tensor as dpt
+
+from ._tensor_impl import (
+    _copy_usm_ndarray_for_reshape,
+    _ravel_multi_index,
+    _unravel_index,
+)
+
+__doc__ = "Implementation module for :func:`dpnp.tensor.reshape`."
+
+
+def _make_unit_indexes(shape):
+    """
+    Construct a diagonal matrix with ones on the diagonal,
+    except where the corresponding element of shape is 1.
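+
+    For example, for ``shape == (2, 1, 3)`` the rows of the result are the
+    unit multi-indexes ``(1, 0, 0)``, ``(0, 0, 0)`` and ``(0, 0, 1)``:
+    axes of length 1 contribute an all-zero row, so they do not affect
+    the stride probing performed by ``reshaped_strides``.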
+ """ + nd = len(shape) + mi = np.zeros((nd, nd), dtype="u4") + for i, dim in enumerate(shape): + mi[i, i] = 1 if dim > 1 else 0 + return mi + + +def ti_unravel_index(flat_index, shape, order="C"): + return _unravel_index(flat_index, shape, order) + + +def ti_ravel_multi_index(multi_index, shape, order="C"): + return _ravel_multi_index(multi_index, shape, order) + + +def reshaped_strides(old_sh, old_sts, new_sh, order="C"): + """ + When reshaping array with `old_sh` shape and `old_sts` strides + into the new shape `new_sh`, returns the new stride if the reshape + can be a view, otherwise returns `None`. + """ + eye_new_mi = _make_unit_indexes(new_sh) + new_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + old_sts, ti_unravel_index(flat_index, old_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, new_sh, order=order) + for unitvec in eye_new_mi + ] + ] + eye_old_mi = _make_unit_indexes(old_sh) + check_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + new_sts, ti_unravel_index(flat_index, new_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, old_sh, order=order) + for unitvec in eye_old_mi + ] + ] + valid = all( + check_st == old_st or old_dim == 1 + for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh) + ) + return new_sts if valid else None + + +def reshape(X, /, shape, *, order="C", copy=None): + """reshape(x, shape, order="C") + + Reshapes array ``x`` into new shape. + + Args: + x (usm_ndarray): + input array + shape (Tuple[int]): + the desired shape of the resulting array. + order ("C", "F", optional): + memory layout of the resulting array + if a copy is found to be necessary. Supported + choices are ``"C"`` for C-contiguous, or row-major layout; + and ``"F"`` for F-contiguous, or column-major layout. + + Returns: + usm_ndarray: + Reshaped array is a view, if possible, + and a copy otherwise with memory layout as indicated + by ``order`` keyword. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError + if not isinstance(shape, (list, tuple)): + shape = (shape,) + if order in "cfCF": + order = order.upper() + else: + raise ValueError( + f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}" + ) + if copy not in (True, False, None): + raise ValueError( + f"Keyword 'copy' not recognized. 
Expecting True, False, " + f"or None, got {copy}" + ) + shape = [operator.index(d) for d in shape] + negative_ones_count = 0 + for nshi in shape: + if nshi == -1: + negative_ones_count = negative_ones_count + 1 + if (nshi < -1) or negative_ones_count > 1: + raise ValueError( + "Target shape should have at most 1 negative " + "value which can only be -1" + ) + if negative_ones_count: + sz = -np.prod(shape) + if sz == 0: + raise ValueError( + f"Can not reshape array of size {X.size} into " + f"shape {tuple(i for i in shape if i >= 0)}" + ) + v = X.size // sz + shape = [v if d == -1 else d for d in shape] + if X.size != np.prod(shape): + raise ValueError(f"Can not reshape into {shape}") + if X.size: + newsts = reshaped_strides(X.shape, X.strides, shape, order=order) + else: + newsts = (1,) * len(shape) + copy_required = newsts is None + if copy_required and (copy is False): + raise ValueError( + "Reshaping the array requires a copy, but no copying was " + "requested by using copy=False" + ) + copy_q = X.sycl_queue + if copy_required or (copy is True): + # must perform a copy + copy_q = X.sycl_queue + flat_res = dpt.usm_ndarray( + (X.size,), + dtype=X.dtype, + buffer=X.usm_type, + buffer_ctor_kwargs={"queue": copy_q}, + ) + _manager = SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + if order == "C": + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + else: + X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, r_e) + return dpt.usm_ndarray( + tuple(shape), dtype=X.dtype, buffer=flat_res, order=order + ) + # can form a view + if (len(shape) == X.ndim) and all( + s1 == s2 for s1, s2 in zip(shape, X.shape) + ): + return X + return dpt.usm_ndarray( + shape, + dtype=X.dtype, + buffer=X, + strides=tuple(newsts), + offset=X._element_offset, + ) diff --git a/dpnp/tensor/_scalar_utils.py b/dpnp/tensor/_scalar_utils.py new file mode 100644 index 000000000000..828f01f1c862 --- /dev/null +++ b/dpnp/tensor/_scalar_utils.py @@ -0,0 +1,123 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numbers
+
+import dpctl.memory as dpm
+import numpy as np
+
+import dpnp.tensor as dpt
+
+from ._type_utils import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _to_device_supported_dtype,
+)
+from ._usmarray import _is_object_with_buffer_protocol as _is_buffer
+
+
+def _get_queue_usm_type(o):
+    """Return a 2-tuple of the SYCL queue and USM type of the memory backing
+    object `o`, or ``(None, None)`` if `o` has no USM allocation."""
+    if isinstance(o, dpt.usm_ndarray):
+        return o.sycl_queue, o.usm_type
+    elif hasattr(o, "__sycl_usm_array_interface__"):
+        try:
+            m = dpm.as_usm_memory(o)
+            return m.sycl_queue, m.get_usm_type()
+        except Exception:
+            return None, None
+    return None, None
+
+
+def _get_dtype(o, dev):
+    if isinstance(o, dpt.usm_ndarray):
+        return o.dtype
+    if hasattr(o, "__sycl_usm_array_interface__"):
+        return dpt.asarray(o).dtype
+    if _is_buffer(o):
+        host_dt = np.array(o).dtype
+        dev_dt = _to_device_supported_dtype(host_dt, dev)
+        return dev_dt
+    if hasattr(o, "dtype"):
+        dev_dt = _to_device_supported_dtype(o.dtype, dev)
+        return dev_dt
+    if isinstance(o, bool):
+        return WeakBooleanType(o)
+    if isinstance(o, int):
+        return WeakIntegralType(o)
+    if isinstance(o, float):
+        return WeakFloatingType(o)
+    if isinstance(o, complex):
+        return WeakComplexType(o)
+    return np.object_
+
+
+def _validate_dtype(dt) -> bool:
+    return isinstance(
+        dt,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ) or (
+        isinstance(dt, dpt.dtype)
+        and dt
+        in [
+            dpt.bool,
+            dpt.int8,
+            dpt.uint8,
+            dpt.int16,
+            dpt.uint16,
+            dpt.int32,
+            dpt.uint32,
+            dpt.int64,
+            dpt.uint64,
+            dpt.float16,
+            dpt.float32,
+            dpt.float64,
+            dpt.complex64,
+            dpt.complex128,
+        ]
+    )
+
+
+def _get_shape(o):
+    if isinstance(o, dpt.usm_ndarray):
+        return o.shape
+    if _is_buffer(o):
+        return memoryview(o).shape
+    if isinstance(o, numbers.Number):
+        return ()
+    return getattr(o, "shape", tuple())
+
+
+__all__ = [
+    "_get_dtype",
+    "_get_queue_usm_type",
+    "_get_shape",
+    "_validate_dtype",
+]
diff --git a/dpnp/tensor/_search_functions.py b/dpnp/tensor/_search_functions.py
new file mode 100644
index 000000000000..c1d45ee4bb33
--- /dev/null
+++ b/dpnp/tensor/_search_functions.py
@@ -0,0 +1,415 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK +from ._manipulation_functions import _broadcast_shape_impl +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + WeakBooleanType, + WeakComplexType, + WeakFloatingType, + WeakIntegralType, + _all_data_types, + _can_cast, + _is_weak_dtype, + _strong_dtype_num_kind, + _to_device_supported_dtype, + _weak_type_num_kind, +) + + +def _default_dtype_from_weak_type(dt, dev): + if isinstance(dt, WeakBooleanType): + return dpt.bool + if isinstance(dt, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dt, WeakFloatingType): + return dpt.dtype(ti.default_device_fp_type(dev)) + if isinstance(dt, WeakComplexType): + return dpt.dtype(ti.default_device_complex_type(dev)) + + +def _resolve_two_weak_types(o1_dtype, o2_dtype, dev): + """Resolves two weak data types per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + return _default_dtype_from_weak_type( + o1_dtype, dev + ), _default_dtype_from_weak_type(o2_dtype, dev) + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _where_result_type(dt1, dt2, dev): + res_dtype = 
dpt.result_type(dt1, dt2)
+    fp16 = dev.has_aspect_fp16
+    fp64 = dev.has_aspect_fp64
+
+    all_dts = _all_data_types(fp16, fp64)
+    if res_dtype in all_dts:
+        return res_dtype
+    else:
+        for res_dtype_ in all_dts:
+            if _can_cast(dt1, res_dtype_, fp16, fp64) and _can_cast(
+                dt2, res_dtype_, fp16, fp64
+            ):
+                return res_dtype_
+    return None
+
+
+def where(condition, x1, x2, /, *, order="K", out=None):
+    """
+    Returns :class:`dpnp.tensor.usm_ndarray` with elements chosen
+    from ``x1`` or ``x2`` depending on ``condition``.
+
+    Args:
+        condition (usm_ndarray): When ``True`` yields from ``x1``,
+            and otherwise yields from ``x2``.
+            Must be compatible with ``x1`` and ``x2`` according
+            to broadcasting rules.
+        x1 (Union[usm_ndarray, bool, int, float, complex]):
+            Array from which values are chosen when ``condition`` is ``True``.
+            Must be compatible with ``condition`` and ``x2`` according
+            to broadcasting rules.
+        x2 (Union[usm_ndarray, bool, int, float, complex]):
+            Array from which values are chosen when ``condition`` is not
+            ``True``.
+            Must be compatible with ``condition`` and ``x1`` according
+            to broadcasting rules.
+        order (``"K"``, ``"C"``, ``"F"``, ``"A"``, optional):
+            Memory layout of the new output array,
+            if parameter ``out`` is ``None``.
+            Default: ``"K"``.
+        out (Optional[usm_ndarray]):
+            the array into which the result is written.
+            The data type of `out` must match the expected shape and the
+            expected data type of the result.
+            If ``None`` then a new array is returned. Default: ``None``.
+
+    Returns:
+        usm_ndarray:
+            An array with elements from ``x1`` where ``condition`` is ``True``,
+            and elements from ``x2`` elsewhere.
+
+            The data type of the returned array is determined by applying
+            the Type Promotion Rules to ``x1`` and ``x2``.
+    """
+    if not isinstance(condition, dpt.usm_ndarray):
+        raise TypeError(
+            "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(condition)}"
+        )
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, condition_usm_type = condition.sycl_queue, condition.usm_type
+    q2, x1_usm_type = _get_queue_usm_type(x1)
+    q3, x2_usm_type = _get_queue_usm_type(x2)
+    if q2 is None and q3 is None:
+        exec_q = q1
+        out_usm_type = condition_usm_type
+    elif q3 is None:
+        exec_q = dpt.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpt.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x1_usm_type,
+            )
+        )
+    elif q2 is None:
+        exec_q = dpt.get_execution_queue((q1, q3))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpt.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x2_usm_type,
+            )
+        )
+    else:
+        exec_q = dpt.get_execution_queue((q1, q2, q3))
+        if exec_q is None:
+            raise dpt.ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        out_usm_type = dpt.get_coerced_usm_type(
+            (
+                condition_usm_type,
+                x1_usm_type,
+                x2_usm_type,
+            )
+        )
+    dpt.validate_usm_type(out_usm_type, allow_none=False)
+    condition_shape = condition.shape
+    x1_shape = _get_shape(x1)
+    x2_shape = _get_shape(x2)
+    if not all(
+        isinstance(s, (tuple, list))
+        for s in (
+            x1_shape,
+            x2_shape,
+        )
+    ):
+        raise TypeError(
+            "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + condition_shape, + x1_shape, + x2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{condition_shape}, {x1_shape}, and {x2_shape}" + ) + sycl_dev = exec_q.sycl_device + x1_dtype = _get_dtype(x1, sycl_dev) + x2_dtype = _get_dtype(x2, sycl_dev) + if not all(_validate_dtype(o) for o in (x1_dtype, x2_dtype)): + raise ValueError("Operands have unsupported data types") + x1_dtype, x2_dtype = _resolve_two_weak_types(x1_dtype, x2_dtype, sycl_dev) + out_dtype = _where_result_type(x1_dtype, x2_dtype, sycl_dev) + if out_dtype is None: + raise TypeError( + "function 'where' does not support input " + f"types ({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced " + "to any supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if out_dtype != out.dtype: + raise ValueError( + f"Output array of type {out_dtype} is needed, " + f"got {out.dtype}" + ) + + if dpt.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpt.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(condition, out) and not ti._same_logical_tensors( + condition, out + ): + out = dpt.empty_like(out) + + if isinstance(x1, dpt.usm_ndarray): + if ( + ti._array_overlap(x1, out) + and not ti._same_logical_tensors(x1, out) + and x1_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if isinstance(x2, dpt.usm_ndarray): + if ( + ti._array_overlap(x2, out) + and not ti._same_logical_tensors(x2, out) + and x2_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + condition, + x1, + x2, + ) + ) + else "C" + ) + if not isinstance(x1, dpt.usm_ndarray): + x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) + if not isinstance(x2, dpt.usm_ndarray): + x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) + + if condition.size == 0: + if out is not None: + return out + else: + if order == "K": + return _empty_like_triple_orderK( + condition, + x1, + x2, + out_dtype, + res_shape, + out_usm_type, + exec_q, + ) + else: + return dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if x1_dtype != out_dtype: + if order == "K": + _x1 = _empty_like_orderK(x1, out_dtype) + else: + _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs + ) + x1 = _x1 + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + + if x2_dtype != out_dtype: + if order == "K": + _x2 = _empty_like_orderK(x2, out_dtype) + else: + _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs + ) + x2 = _x2 + 
_manager.add_event_pair(ht_copy2_ev, copy2_ev) + + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + if condition_shape != res_shape: + condition = dpt.broadcast_to(condition, res_shape) + if x1_shape != res_shape: + x1 = dpt.broadcast_to(x1, res_shape) + if x2_shape != res_shape: + x2 = dpt.broadcast_to(x2, res_shape) + + dep_evs = _manager.submitted_events + hev, where_ev = ti._where( + condition=condition, + x1=x1, + x2=x2, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, where_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[where_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + + return out diff --git a/dpnp/tensor/_searchsorted.py b/dpnp/tensor/_searchsorted.py new file mode 100644 index 000000000000..4c9b54cb63fa --- /dev/null +++ b/dpnp/tensor/_searchsorted.py @@ -0,0 +1,189 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+
+from typing import Literal, Union
+
+import dpctl
+import dpctl.utils as du
+
+from ._compute_follows_data import (
+    ExecutionPlacementError,
+    get_coerced_usm_type,
+    get_execution_queue,
+)
+from ._copy_utils import _empty_like_orderK
+from ._ctors import empty
+from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy
+from ._tensor_impl import _take as ti_take
+from ._tensor_impl import (
+    default_device_index_type as ti_default_device_index_type,
+)
+from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right
+from ._type_utils import isdtype, result_type
+from ._usmarray import usm_ndarray
+
+
+def searchsorted(
+    x1: usm_ndarray,
+    x2: usm_ndarray,
+    /,
+    *,
+    side: Literal["left", "right"] = "left",
+    sorter: Union[usm_ndarray, None] = None,
+) -> usm_ndarray:
+    """searchsorted(x1, x2, side='left', sorter=None)
+
+    Finds the indices into `x1` such that, if the corresponding elements
+    in `x2` were inserted before the indices, the order of `x1`, when sorted
+    in ascending order, would be preserved.
+
+    Args:
+        x1 (usm_ndarray):
+            input array. Must be a one-dimensional array. If `sorter` is
+            `None`, must be sorted in ascending order; otherwise, `sorter` must
+            be an array of indices that sort `x1` in ascending order.
+        x2 (usm_ndarray):
+            array containing search values.
+        side (Literal["left", "right"]):
+            argument controlling which index is returned if a value lands
+            exactly on an edge. If `x2` is an array of rank `N` where
+            `v = x2[n, m, ..., j]`, the element `ret[n, m, ..., j]` in the
+            return array `ret` contains the position `i` such that
+            if `side="left"`, it is the first index such that
+            `x1[i-1] < v <= x1[i]`, `0` if `v <= x1[0]`, and `x1.size`
+            if `v > x1[-1]`;
+            and if `side="right"`, it is the first position `i` such that
+            `x1[i-1] <= v < x1[i]`, `0` if `v < x1[0]`, and `x1.size`
+            if `v >= x1[-1]`. Default: `"left"`.
+        sorter (Optional[usm_ndarray]):
+            array of indices that sort `x1` in ascending order. The array must
+            have the same shape as `x1` and have an integral data type.
+            Out of bound index values of `sorter` array are treated using
+            `"wrap"` mode documented in :py:func:`dpnp.tensor.take`.
+            Default: `None`.
+    """
+    if not isinstance(x1, usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x1)}")
+    if not isinstance(x2, usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x2)}")
+    if sorter is not None and not isinstance(sorter, usm_ndarray):
+        raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(sorter)}")
+
+    if side not in ["left", "right"]:
+        raise ValueError(
+            "Unrecognized value of 'side' keyword argument. "
+            "Expected either 'left' or 'right'"
+        )
+
+    if sorter is None:
+        q = get_execution_queue([x1.sycl_queue, x2.sycl_queue])
+    else:
+        q = get_execution_queue(
+            [x1.sycl_queue, x2.sycl_queue, sorter.sycl_queue]
+        )
+    if q is None:
+        raise ExecutionPlacementError(
+            "Execution placement can not be unambiguously "
+            "inferred from input arguments."
+ ) + + if x1.ndim != 1: + raise ValueError("First argument array must be one-dimensional") + + x1_dt = x1.dtype + x2_dt = x2.dtype + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + ev = dpctl.SyclEvent() + if sorter is not None: + if not isdtype(sorter.dtype, "integral"): + raise ValueError( + f"Sorter array must have integral data type, got {sorter.dtype}" + ) + if x1.shape != sorter.shape: + raise ValueError( + "Sorter array must be one-dimension with the same " + "shape as the first argument array" + ) + res = empty(x1.shape, dtype=x1_dt, usm_type=x1.usm_type, sycl_queue=q) + ind = (sorter,) + axis = 0 + wrap_out_of_bound_indices_mode = 0 + ht_ev, ev = ti_take( + x1, + ind, + res, + axis, + wrap_out_of_bound_indices_mode, + sycl_queue=q, + depends=dep_evs, + ) + x1 = res + _manager.add_event_pair(ht_ev, ev) + + if x1_dt != x2_dt: + dt = result_type(x1, x2) + if x1_dt != dt: + x1_buf = _empty_like_orderK(x1, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x1 = x1_buf + if x2_dt != dt: + x2_buf = _empty_like_orderK(x2, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x2 = x2_buf + + dst_usm_type = get_coerced_usm_type([x1.usm_type, x2.usm_type]) + index_dt = ti_default_device_index_type(q) + + dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type) + + dep_evs = _manager.submitted_events + if side == "left": + ht_ev, s_ev = _searchsorted_left( + hay=x1, + needles=x2, + positions=dst, + sycl_queue=q, + depends=dep_evs, + ) + else: + ht_ev, s_ev = _searchsorted_right( + hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/tensor/_set_functions.py b/dpnp/tensor/_set_functions.py new file mode 100644 index 000000000000..067de75c42ce --- /dev/null +++ b/dpnp/tensor/_set_functions.py @@ -0,0 +1,794 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from typing import NamedTuple, Optional, Union + +import dpctl.utils as du + +import dpnp.tensor as dpt + +from ._copy_utils import _empty_like_orderK +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._tensor_elementwise_impl import _not_equal, _subtract +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + _take, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import ( + _argsort_ascending, + _isin, + _searchsorted_left, + _sort_ascending, +) +from ._type_utils import ( + _resolve_weak_types_all_py_ints, + _to_device_supported_dtype, +) + +__all__ = [ + "isin", + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] + + +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. 
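+
+    A minimal illustrative example (assuming `unique_values` is re-exported
+    as `dpnp.tensor.unique_values` and a default SYCL device is available)::
+
+        import dpnp.tensor as dpt
+
+        x = dpt.asarray([3, 1, 2, 1, 3])
+        vals = dpt.unique_values(x)  # sorted unique elements: 1, 2, 3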
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx + s = dpt.empty_like(fx, order="C") + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # writing into new allocation, no dependencies + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return s + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt)) + s = dpt.empty_like(fx, order="C") + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency, since we write into new allocation + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return UniqueCountsResult( + s, + dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + # writing into new allocation, no dependency + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + # no dependency, writing into disjoint segmenent of new allocation + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt.empty_like(unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. 
+ Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape)) + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, argsort_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, argsort_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape)) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) 
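+    # cum_unique_counts[:-1] now holds the position of the first
+    # occurrence of each unique value in the sorted copy; the total
+    # element count is appended next so that adjacent differences give
+    # the per-value counts.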
+ ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + _manager.add_event_pair(ht_ev, ssl_ev) + + return UniqueInverseResult(unique_vals, inv) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
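+
+    A minimal illustrative example (assuming `unique_all` is re-exported
+    as `dpnp.tensor.unique_all`)::
+
+        import dpnp.tensor as dpt
+
+        x = dpt.asarray([3, 1, 2, 1, 3])
+        r = dpt.unique_all(x)
+        # r.values: [1, 2, 3]; r.indices (first occurrences): [1, 2, 0]
+        # r.inverse_indices: [2, 0, 1, 0, 2]; r.counts: [2, 1, 2]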
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt.reshape(x, (x.size,), order="C") + sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + dpt.empty_like(fx, dtype=ind_dt), + ) + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, args_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, args_ev) + s = dpt.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + _counts = dpt.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt.reshape(unsorting_ids, x.shape), + _counts, + ) + unique_vals = dpt.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + 
src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + _manager.add_event_pair(ht_ev, ssl_ev) + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv, + _counts, + ) + + +def isin( + x: Union[dpt.usm_ndarray, int, float, complex, bool], + test_elements: Union[dpt.usm_ndarray, int, float, complex, bool], + /, + *, + invert: Optional[bool] = False, +) -> dpt.usm_ndarray: + """isin(x, test_elements, /, *, invert=False) + + Tests `x in test_elements` for each element of `x`. Returns a boolean array + with the same shape as `x` that is `True` where the element is in + `test_elements`, `False` otherwise. + + Args: + x (Union[usm_ndarray, bool, int, float, complex]): + input element or elements. + test_elements (Union[usm_ndarray, bool, int, float, complex]): + elements against which to test each value of `x`. + invert (Optional[bool]): + if `True`, the output results are inverted, i.e., are equivalent to + testing `x not in test_elements` for each element of `x`. + Default: `False`. + + Returns: + usm_ndarray: + an array of the inclusion test results. The returned array has a + boolean data type and the same shape as `x`. + """ + q1, x_usm_type = _get_queue_usm_type(x) + q2, test_usm_type = _get_queue_usm_type(test_elements) + if q1 is None and q2 is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + "One of the arguments must represent USM allocation and " + "expose `__sycl_usm_array_interface__` property" + ) + if q1 is None: + exec_q = q2 + res_usm_type = test_usm_type + elif q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + res_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + test_usm_type, + ) + ) + dpt.validate_usm_type(res_usm_type, allow_none=False) + sycl_dev = exec_q.sycl_device + + if not isinstance(invert, bool): + raise TypeError( + "`invert` keyword argument must be of boolean type, " + f"got {type(invert)}" + ) + + x_dt = _get_dtype(x, sycl_dev) + test_dt = _get_dtype(test_elements, sycl_dev) + if not all(_validate_dtype(dt) for dt in (x_dt, test_dt)): + raise ValueError("Operands have unsupported data types") + + x_sh = _get_shape(x) + if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0: + if invert: + return dpt.ones( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + return dpt.zeros( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + + dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev) + dt = _to_device_supported_dtype(dpt.result_type(dt1, dt2), sycl_dev) + + if not isinstance(x, dpt.usm_ndarray): + x_arr = dpt.asarray( + x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + x_arr = x + + if not isinstance(test_elements, dpt.usm_ndarray): + test_arr = dpt.asarray( + test_elements, dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + test_arr = test_elements + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + if x_dt != dt: + x_buf = _empty_like_orderK(x_arr, dt, res_usm_type, exec_q) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=x_arr, dst=x_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + x_buf = x_arr + + if test_dt != dt: + # copy into C-contiguous memory, because the array will be flattened + test_buf = dpt.empty_like( + test_arr, dtype=dt, order="C", usm_type=res_usm_type + ) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=test_arr, dst=test_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + test_buf = test_arr + + test_buf = dpt.reshape(test_buf, -1) + test_buf = dpt.sort(test_buf) + + dst = dpt.empty_like( + x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C" + ) + + dep_evs = _manager.submitted_events + ht_ev, s_ev = _isin( + needles=x_buf, + hay=test_buf, + dst=dst, + sycl_queue=exec_q, + invert=invert, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/tensor/_slicing.pxi b/dpnp/tensor/_slicing.pxi new file mode 100644 index 000000000000..f387aef8afd8 --- /dev/null +++ b/dpnp/tensor/_slicing.pxi @@ -0,0 +1,383 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numbers
+from operator import index
+from cpython.buffer cimport PyObject_CheckBuffer
+from numpy import ndarray
+
+
+cdef bint _is_buffer(object o):
+    return PyObject_CheckBuffer(o)
+
+
+cdef Py_ssize_t _slice_len(
+    Py_ssize_t sl_start,
+    Py_ssize_t sl_stop,
+    Py_ssize_t sl_step
+):
+    """
+    Compute len(range(sl_start, sl_stop, sl_step))
+    """
+    if sl_start == sl_stop:
+        return 0
+    if sl_step > 0:
+        if sl_start > sl_stop:
+            return 0
+        # 1 + argmax k such that sl_start + sl_step*k < sl_stop
+        return 1 + ((sl_stop - sl_start - 1) // sl_step)
+    else:
+        if sl_start < sl_stop:
+            return 0
+        return 1 + ((sl_stop - sl_start + 1) // sl_step)
+
+
+cdef bint _is_integral(object x) except *:
+    """Gives True if x is an integral slice spec"""
+    if isinstance(x, (ndarray, usm_ndarray)):
+        if x.ndim > 0:
+            return False
+        if x.dtype.kind not in "ui":
+            return False
+        return True
+    if isinstance(x, bool):
+        return False
+    if isinstance(x, int):
+        return True
+    if _is_buffer(x):
+        mbuf = memoryview(x)
+        if mbuf.ndim == 0:
+            f = mbuf.format
+            return f in "bBhHiIlLqQ"
+        else:
+            return False
+    if callable(getattr(x, "__index__", None)):
+        try:
+            index(x)
+        except (TypeError, ValueError):
+            return False
+        return True
+    return False
+
+
+cdef bint _is_boolean(object x) except *:
+    """Gives True if x is a boolean slice spec"""
+    if isinstance(x, (ndarray, usm_ndarray)):
+        if x.ndim > 0:
+            return False
+        if x.dtype.kind not in "b":
+            return False
+        return True
+    if isinstance(x, bool):
+        return True
+    if isinstance(x, (int, float, complex)):
+        return False
+    if _is_buffer(x):
+        mbuf = memoryview(x)
+        if mbuf.ndim == 0:
+            f = mbuf.format
+            return f in "?"
+        else:
+            return False
+    if callable(getattr(x, "__bool__", None)):
+        try:
+            x.__bool__()
+        except (TypeError, ValueError):
+            return False
+        return True
+    return False
+
+
+def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
+    """
+    Given basic slicing index `ind` and array layout information, produce
+    a 5-tuple (resulting_shape, resulting_strides, resulting_offset,
+    advanced_ind, resulting_advanced_ind_pos)
+    used to construct a view into underlying array over which advanced
+    indexing, if any, is to be performed.
+
+    Raises IndexError for invalid index `ind`.
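+
+    For example (a worked call, derived from the slice branch below):
+
+        _basic_slice_meta(slice(1, None, 2), (10,), (1,), 0)
+        # -> ((5,), (2,), 1, (), -1)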
+ """ + _no_advanced_ind = tuple() + _no_advanced_pos = -1 + if ind is Ellipsis: + return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos) + elif ind is None: + return ( + (1,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif isinstance(ind, slice): + sl_start, sl_stop, sl_step = ind.indices(shape[0]) + sh0 = _slice_len(sl_start, sl_stop, sl_step) + str0 = sl_step * strides[0] + new_strides = ( + strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:] + ) + new_shape = (sh0, ) + shape[1:] + is_empty = any(sh_i == 0 for sh_i in new_shape) + new_offset = offset if is_empty else offset + sl_start * strides[0] + return ( + new_shape, + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif _is_boolean(ind): + if ind: + return ( + (1,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + else: + return ( + (0,) + shape, + (0,) + strides, + offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif _is_integral(ind): + ind = index(ind) + new_shape = shape[1:] + new_strides = strides[1:] + is_empty = any(sh_i == 0 for sh_i in new_shape) + if 0 <= ind < shape[0]: + new_offset = offset if is_empty else offset + ind * strides[0] + return ( + new_shape, + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + elif -shape[0] <= ind < 0: + new_offset = ( + offset if is_empty else offset + (shape[0] + ind) * strides[0] + ) + return ( + new_shape, + new_strides, + new_offset, + _no_advanced_ind, + _no_advanced_pos, + ) + else: + raise IndexError( + "Index {0} is out of range for axes 0 with " + "size {1}".format(ind, shape[0])) + elif isinstance(ind, (ndarray, usm_ndarray)): + return (shape, strides, offset, (ind,), 0) + elif isinstance(ind, tuple): + axes_referenced = 0 + ellipses_count = 0 + newaxis_count = 0 + explicit_index = 0 + seen_arrays_yet = False + array_streak_started = False + array_streak_interrupted = False + for i in ind: + if i is None: + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif i is Ellipsis: + ellipses_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif isinstance(i, slice): + axes_referenced += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_boolean(i): + newaxis_count += 1 + if array_streak_started: + array_streak_interrupted = True + elif _is_integral(i): + axes_referenced += 1 + if not array_streak_started and array_streak_interrupted: + explicit_index += 1 + elif isinstance(i, (ndarray, usm_ndarray)): + if not seen_arrays_yet: + seen_arrays_yet = True + array_streak_started = True + array_streak_interrupted = False + if array_streak_interrupted: + raise IndexError( + "Advanced indexing array specs may not be " + "separated by basic slicing specs." + ) + dt_k = i.dtype.kind + if dt_k == "b" and i.ndim > 0: + axes_referenced += i.ndim + elif dt_k in "ui" and i.ndim > 0: + axes_referenced += 1 + else: + raise IndexError( + "arrays used as indices must be of integer " + "(or boolean) type" + ) + else: + raise IndexError( + "Only integers, slices (`:`), ellipsis (`...`), " + "dpnp.tensor.newaxis (`None`) and integer and " + "boolean arrays are valid indices." 
+ ) + if ellipses_count > 1: + raise IndexError( + "an index can only have a single ellipsis ('...')") + if axes_referenced > len(shape): + raise IndexError( + "too many indices for an array, array is " + "{0}-dimensional, but {1} were indexed".format( + len(shape), axes_referenced)) + if ellipses_count: + ellipses_count = len(shape) - axes_referenced + new_shape_len = (newaxis_count + ellipses_count + + axes_referenced - explicit_index) + new_shape = list() + new_strides = list() + new_advanced_ind = list() + k = 0 + new_advanced_start_pos = -1 + advanced_start_pos_set = False + new_offset = offset + is_empty = False + array_streak = False + for i in range(len(ind)): + ind_i = ind[i] + if (ind_i is Ellipsis): + k_new = k + ellipses_count + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + if any(dim == 0 for dim in shape[k:k_new]): + is_empty = True + new_offset = offset + k = k_new + if array_streak: + array_streak = False + elif ind_i is None: + new_shape.append(1) + new_strides.append(0) + if array_streak: + array_streak = False + elif isinstance(ind_i, slice): + k_new = k + 1 + sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) + sh_i = _slice_len(sl_start, sl_stop, sl_step) + str_i = (1 if sh_i == 0 else sl_step) * strides[k] + new_shape.append(sh_i) + new_strides.append(str_i) + if sh_i > 0 and not is_empty: + new_offset = new_offset + sl_start * strides[k] + if sh_i == 0: + is_empty = True + new_offset = offset + k = k_new + if array_streak: + array_streak = False + elif _is_boolean(ind_i): + new_shape.append(1 if ind_i else 0) + new_strides.append(0) + if array_streak: + array_streak = False + elif _is_integral(ind_i): + if array_streak: + if not isinstance(ind_i, (ndarray, usm_ndarray)): + ind_i = index(ind_i) + # integer will be converted to an array, + # still raise if OOB + if not ( + 0 <= ind_i < shape[k] or -shape[k] <= ind_i < 0 + ): + raise IndexError( + "Index {0} is out of range for axes " + "{1} with size {2}".format(ind_i, k, shape[k]) + ) + new_advanced_ind.append(ind_i) + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + else: + ind_i = index(ind_i) + if 0 <= ind_i < shape[k]: + k_new = k + 1 + if not is_empty: + new_offset = new_offset + ind_i * strides[k] + k = k_new + elif -shape[k] <= ind_i < 0: + k_new = k + 1 + if not is_empty: + new_offset = ( + new_offset + (shape[k] + ind_i) * strides[k] + ) + k = k_new + else: + raise IndexError( + "Index {0} is out of range for axes " + "{1} with size {2}".format(ind_i, k, shape[k]) + ) + elif isinstance(ind_i, (ndarray, usm_ndarray)): + if not array_streak: + array_streak = True + if not advanced_start_pos_set: + new_advanced_start_pos = len(new_shape) + advanced_start_pos_set = True + new_advanced_ind.append(ind_i) + dt_k = ind_i.dtype.kind + if dt_k == "b": + k_new = k + ind_i.ndim + else: + k_new = k + 1 + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + new_shape.extend(shape[k:]) + new_strides.extend(strides[k:]) + new_shape_len += len(shape) - k + return ( + tuple(new_shape), + tuple(new_strides), + new_offset, + tuple(new_advanced_ind), + new_advanced_start_pos + ) + else: + raise IndexError( + "Only integers, slices (`:`), ellipsis (`...`), " + "dpnp.tensor.newaxis (`None`) and integer and " + "boolean arrays are valid indices." 
+ ) diff --git a/dpnp/tensor/_sorting.py b/dpnp/tensor/_sorting.py new file mode 100644 index 000000000000..c912b4f77cdf --- /dev/null +++ b/dpnp/tensor/_sorting.py @@ -0,0 +1,441 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator +from typing import NamedTuple + +import dpctl.utils as du + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _radix_argsort_ascending, + _radix_argsort_descending, + _radix_sort_ascending, + _radix_sort_descending, + _radix_sort_dtype_supported, + _sort_ascending, + _sort_descending, + _topk, +) + +__all__ = ["sort", "argsort", "top_k"] + + +def _get_mergesort_impl_fn(descending): + return _sort_descending if descending else _sort_ascending + + +def _get_radixsort_impl_fn(descending): + return _radix_sort_descending if descending else _radix_sort_ascending + + +def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None): + """sort(x, axis=-1, descending=False, stable=True) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. 
The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. + Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.copy(x, order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergesort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixsort_impl_fn(descending) + else: + impl_fn = _get_mergesort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + return res + + +def _get_mergeargsort_impl_fn(descending): + return _argsort_descending if descending else _argsort_ascending + + +def _get_radixargsort_impl_fn(descending): + return _radix_argsort_descending if descending else _radix_argsort_ascending + + +def argsort(x, axis=-1, descending=False, stable=True, kind=None): + """argsort(x, axis=-1, descending=False, stable=True) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. 
The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergeargsort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixargsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixargsort_impl_fn(descending) + else: + impl_fn = _get_mergeargsort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(res, inv_perm) + return res + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." + ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + + Default: `"largest"`. 
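+
+            As an illustrative shape check (hypothetical input): for `x` of
+            shape (3, 5), `top_k(x, 2, axis=1)` yields `values` and `indices`
+            of shape (3, 2), per the rules described below.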
+ + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, indices)` whose + + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected type dpnp.tensor.usm_ndarray, got {type(x)}") + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + if k > 1: + raise ValueError(f"`k`={k} is out of bounds 1") + return TopKResult( + dpt.copy(x, order="C"), + dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt.permute_dims(vals, inv_perm) + inds = dpt.permute_dims(inds, inv_perm) + + return TopKResult(vals, inds) diff --git a/dpnp/tensor/_statistical_functions.py b/dpnp/tensor/_statistical_functions.py new file mode 100644 index 000000000000..a2015488aff2 --- /dev/null +++ b/dpnp/tensor/_statistical_functions.py @@ -0,0 +1,379 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. + +import dpctl.utils as du + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_elementwise_impl as tei +import dpnp.tensor._tensor_impl as ti +import dpnp.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple + + +def _var_impl(x, axis, correction, keepdims): + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + red_nd = len(axis) + perm = perm + list(axis) + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + inp_dt + if inp_dt.kind == "f" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if inp_dt != res_dt: + buf = dpt.empty_like(x, dtype=res_dt) + ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_buf, c_e1) + else: + buf = x + # calculate mean + buf2 = dpt.permute_dims(buf, perm) + res_shape = buf2.shape[: nd - red_nd] + # use keepdims=True path for later broadcasting + if red_nd == 0: + mean_ary = dpt.empty_like(buf) + dep_evs = _manager.submitted_events + ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=buf, dst=mean_ary, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e1, c_e2) + else: + mean_ary = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + dep_evs = _manager.submitted_events + ht_e1, r_e1 = tri._sum_over_axis( + src=buf2, + trailing_dims_to_reduce=red_nd, + dst=mean_ary, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e1, r_e1) + + mean_ary_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + mean_ary = dpt.permute_dims( + dpt.reshape(mean_ary, mean_ary_shape), inv_perm + ) + # divide in-place to get mean + mean_ary_shape = 
mean_ary.shape + + dep_evs = _manager.submitted_events + ht_e2, d_e1 = tei._divide_by_scalar( + src=mean_ary, scalar=nelems, dst=mean_ary, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e2, d_e1) + + # subtract mean from original array to get deviations + dev_ary = dpt.empty_like(buf) + if mean_ary_shape != buf.shape: + mean_ary = dpt.broadcast_to(mean_ary, buf.shape) + ht_e4, su_e = tei._subtract( + src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1] + ) + _manager.add_event_pair(ht_e4, su_e) + # square deviations + ht_e5, sq_e = tei._square( + src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e] + ) + _manager.add_event_pair(ht_e5, sq_e) + + # take sum of squared deviations + dev_ary2 = dpt.permute_dims(dev_ary, perm) + if red_nd == 0: + res = dev_ary + else: + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e6, r_e2 = tri._sum_over_axis( + src=dev_ary2, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[sq_e], + ) + _manager.add_event_pair(ht_e6, r_e2) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + res_shape = res.shape + # when nelems - correction <= 0, yield nans + div = max(nelems - correction, 0) + if not div: + div = dpt.nan + dep_evs = _manager.submitted_events + ht_e7, d_e2 = tei._divide_by_scalar( + src=res, scalar=div, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e7, d_e2) + return res, [d_e2] + + +def mean(x, axis=None, keepdims=False): + """mean(x, axis=None, keepdims=False) + + Calculates the arithmetic mean of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the arithmetic means must be computed. If + a tuple of unique integers, the means are computed over multiple + axes. If `None`, the mean is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the arithmetic means. If the mean was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a floating-point data type, the returned array will have + the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. 
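+
+    A small usage sketch (illustrative values; assumes a default-selected
+    device is available):
+
+        x = dpt.asarray([[1.0, 2.0], [3.0, 4.0]])
+        mean(x, axis=1)   # row means -> [1.5, 3.5], shape (2,)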
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + sum_nd = len(axis) + perm = perm + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - sum_nd] + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + x.dtype + if x.dtype.kind in "fc" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + if sum_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e1, r_e = tri._sum_over_axis( + src=arr2, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e1, r_e) + else: + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = tri._sum_over_axis( + src=tmp, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, r_e) + + if keepdims: + res_shape = res_shape + (1,) * sum_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + + dep_evs = _manager.submitted_events + ht_e2, div_e = tei._divide_by_scalar( + src=res, scalar=nelems, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e2, div_e) + return res + + +def var(x, axis=None, correction=0.0, keepdims=False): + """var(x, axis=None, correction=0.0, keepdims=False) + + Calculates the variance of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the variances must be computed. If a tuple + of unique integers, the variances are computed over multiple axes. + If `None`, the variance is computed over the entire array. + Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + variance is `N - correction`, where `N` corresponds to the total + number of elements over which the variance is calculated. + Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the variances. If the variance was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a real-valued floating-point data type, the returned + array will have the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`, got" + f"{type(x)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`var` does not support complex types") + + res, _ = _var_impl(x, axis, correction, keepdims) + return res + + +def std(x, axis=None, correction=0.0, keepdims=False): + """std(x, axis=None, correction=0.0, keepdims=False) + + Calculates the standard deviation of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the standard deviations must be computed. + If a tuple of unique integers, the standard deviations are computed + over multiple axes. If `None`, the standard deviation is computed + over the entire array. Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + standard deviation is `N - correction`, where `N` corresponds to the + total number of elements over which the standard deviation is + calculated. Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the standard deviations. If the standard + deviation was computed over the entire array, a zero-dimensional + array is returned. + + If `x` has a real-valued floating-point data type, the returned + array will have the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`," + f"got {type(x)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`std` does not support complex types") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + res, deps = _var_impl(x, axis, correction, keepdims) + ht_ev, sqrt_ev = tei._sqrt( + src=res, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, sqrt_ev) + return res diff --git a/dpnp/tensor/_stride_utils.pxi b/dpnp/tensor/_stride_utils.pxi new file mode 100644 index 000000000000..3caf8dd8fd1f --- /dev/null +++ b/dpnp/tensor/_stride_utils.pxi @@ -0,0 +1,314 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+
+from cpython.mem cimport PyMem_Free, PyMem_Malloc
+from cpython.ref cimport Py_INCREF
+from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
+
+
+cdef int ERROR_MALLOC = 1
+cdef int ERROR_INTERNAL = -1
+cdef int ERROR_INCORRECT_ORDER = 2
+cdef int ERROR_UNEXPECTED_STRIDES = 3
+
+cdef int USM_ARRAY_C_CONTIGUOUS = 1
+cdef int USM_ARRAY_F_CONTIGUOUS = 2
+cdef int USM_ARRAY_WRITABLE = 4
+
+
+cdef Py_ssize_t shape_to_elem_count(int nd, Py_ssize_t *shape_arr):
+    """
+    Computes number of elements in an array.
+    """
+    cdef Py_ssize_t count = 1
+    for i in range(nd):
+        count *= shape_arr[i]
+    return count
+
+
+cdef int _from_input_shape_strides(
+    int nd, object shape, object strides, int itemsize, char order,
+    Py_ssize_t **shape_ptr, Py_ssize_t **strides_ptr,
+    Py_ssize_t *nelems, Py_ssize_t *min_disp, Py_ssize_t *max_disp,
+    int *contig
+):
+    """
+    Arguments: nd, shape, strides, itemsize, order
+    Modifies:
+        shape_ptr - pointer to C array for shape values
+        strides_ptr - pointer to C array for strides values
+        nelems - Number of elements in array
+        min_disp = min(dot(strides, index) over all valid indices for shape)
+        max_disp = max(dot(strides, index) over all valid indices for shape)
+        contig = enumeration for array contiguity
+    Returns: 0 on success, error code otherwise.
+ On success pointers point to allocated arrays, + Otherwise they are set to NULL + """ + cdef int i + cdef int j + cdef bint all_incr = 1 + cdef bint all_decr = 1 + cdef bint strides_inspected = 0 + cdef Py_ssize_t elem_count = 1 + cdef Py_ssize_t min_shift = 0 + cdef Py_ssize_t max_shift = 0 + cdef Py_ssize_t str_i + cdef Py_ssize_t* shape_arr + cdef Py_ssize_t* strides_arr + + if (int(order) not in [ord("C"), ord("F"), ord("c"), ord("f")]): + return ERROR_INCORRECT_ORDER + + # 0-d array + if (nd == 0): + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + nelems[0] = 1 + min_disp[0] = 0 + max_disp[0] = 0 + shape_ptr[0] = (0) + strides_ptr[0] = (0) + return 0 + + shape_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not shape_arr): + return ERROR_MALLOC + shape_ptr[0] = shape_arr + for i in range(0, nd): + shape_arr[i] = shape[i] + elem_count *= shape_arr[i] + if elem_count == 0: + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + nelems[0] = 1 + min_disp[0] = 0 + max_disp[0] = 0 + if strides is None: + strides_ptr[0] = (0) + else: + strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not strides_arr): + PyMem_Free(shape_ptr[0]) + shape_ptr[0] = (0) + return ERROR_MALLOC + strides_ptr[0] = strides_arr + for i in range(0, nd): + strides_arr[i] = strides[i] + return 0 + nelems[0] = elem_count + if (strides is None): + # no need to allocate and populate strides + if order == ord("C") or order == ord("c"): + contig[0] = USM_ARRAY_C_CONTIGUOUS + else: + contig[0] = USM_ARRAY_F_CONTIGUOUS + if nd == 1: + contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS + else: + j = 0 + for i in range(nd): + if shape_arr[i] > 1: + j = j + 1 + if j < 2: + contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS + min_disp[0] = 0 + max_disp[0] = (elem_count - 1) + strides_ptr[0] = (0) + return 0 + elif ((isinstance(strides, (list, tuple)) or hasattr(strides, "tolist")) + and len(strides) == nd): + strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not strides_arr): + PyMem_Free(shape_ptr[0]) + shape_ptr[0] = (0) + return ERROR_MALLOC + strides_ptr[0] = strides_arr + for i in range(0, nd): + str_i = strides[i] + strides_arr[i] = str_i + if str_i > 0: + max_shift += str_i * (shape_arr[i] - 1) + else: + min_shift += str_i * (shape_arr[i] - 1) + min_disp[0] = min_shift + max_disp[0] = max_shift + if max_shift == min_shift + (elem_count - 1): + if elem_count == 1: + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + return 0 + if nd == 1: + if strides_arr[0] == 1: + contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS + else: + contig[0] = 0 + return 0 + i = 0 + while i < nd: + if shape_arr[i] == 1: + i = i + 1 + continue + j = i + 1 + while (j < nd and shape_arr[j] == 1): + j = j + 1 + if j < nd: + strides_inspected = 1 + if all_incr: + all_incr = ( + (strides_arr[i] > 0) and + (strides_arr[j] > 0) and + (strides_arr[i] <= strides_arr[j]) + ) + if all_decr: + all_decr = ( + (strides_arr[i] > 0) and + (strides_arr[j] > 0) and + (strides_arr[i] >= strides_arr[j]) + ) + i = j + else: + if not strides_inspected: + # all dimensions have size 1 except + # dimension 'i'. 
Array is both C and F + # contiguous + strides_inspected = 1 + all_incr = (strides_arr[i] == 1) + all_decr = all_incr + break + # should only set contig flags on actually obtained + # values, rather than default values + all_incr = all_incr and strides_inspected + all_decr = all_decr and strides_inspected + if all_incr and all_decr: + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) + elif all_incr: + contig[0] = USM_ARRAY_F_CONTIGUOUS + elif all_decr: + contig[0] = USM_ARRAY_C_CONTIGUOUS + else: + contig[0] = 0 + return 0 + else: + contig[0] = 0 # non-contiguous + return 0 + else: + PyMem_Free(shape_ptr[0]) + shape_ptr[0] = (0) + return ERROR_UNEXPECTED_STRIDES + # return ERROR_INTERNAL + + +cdef object _make_int_tuple(int nd, const Py_ssize_t *ary): + """ + Makes Python tuple from C array + """ + cdef tuple res + cdef object tmp + if (ary): + res = PyTuple_New(nd) + for i in range(nd): + tmp = ary[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, i, tmp) + return res + else: + return None + + +cdef object _make_reversed_int_tuple(int nd, const Py_ssize_t *ary): + """ + Makes Python reversed tuple from C array + """ + cdef tuple res + cdef object tmp + cdef int i + cdef int nd_1 + if (ary): + res = PyTuple_New(nd) + nd_1 = nd - 1 + for i in range(nd): + tmp = ary[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, nd_1 - i, tmp) + return res + else: + return None + + +cdef object _c_contig_strides(int nd, Py_ssize_t *shape): + """ + Makes Python tuple for strides of C-contiguous array + """ + cdef tuple cc_strides = PyTuple_New(nd) + cdef object si = 1 + cdef int i + cdef int nd_1 = nd - 1 + for i in range(0, nd): + Py_INCREF(si) # SetItem steals the reference + PyTuple_SetItem(cc_strides, nd_1 - i, si) + si = si * shape[nd_1 - i] + return cc_strides + + +cdef object _f_contig_strides(int nd, Py_ssize_t *shape): + """ + Makes Python tuple for strides of F-contiguous array + """ + cdef tuple fc_strides = PyTuple_New(nd) + cdef object si = 1 + for i in range(0, nd): + Py_INCREF(si) # SetItem steals the reference + PyTuple_SetItem(fc_strides, i, si) + si = si * shape[i] + return fc_strides + +cdef object _swap_last_two(tuple t): + """ + Swap last two elements of a tuple + """ + cdef int nd = len(t) + cdef tuple res + cdef int i + cdef object tmp + if (nd < 2): + return t + res = PyTuple_New(nd) + # copy all elements except the last two + for i in range(0, nd-2): + tmp = t[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, i, tmp) + # swap the last two elements + tmp = t[nd-1] + Py_INCREF(tmp) # SetItem steals + PyTuple_SetItem(res, nd - 2, tmp) + tmp = t[nd-2] + Py_INCREF(tmp) # SetItem steals + PyTuple_SetItem(res, nd - 1, tmp) + return res diff --git a/dpnp/tensor/_testing.py b/dpnp/tensor/_testing.py new file mode 100644 index 000000000000..fbec13fdeb36 --- /dev/null +++ b/dpnp/tensor/_testing.py @@ -0,0 +1,163 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+
+import dpnp.tensor as dpt
+
+from ._manipulation_functions import _broadcast_shape_impl
+from ._type_utils import _to_device_supported_dtype
+
+
+def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
+    z1r = dpt.real(z1)
+    z1i = dpt.imag(z1)
+    z2r = dpt.real(z2)
+    z2i = dpt.imag(z2)
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(z1r) == dpt.isnan(z2r)) and dpt.all(
+            dpt.isnan(z1i) == dpt.isnan(z2i)
+        )
+    else:
+        check1 = (
+            dpt.logical_not(dpt.any(dpt.isnan(z1r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z1i)))
+        ) and (
+            dpt.logical_not(dpt.any(dpt.isnan(z2r)))
+            and dpt.logical_not(dpt.any(dpt.isnan(z2i)))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(z1r)
+    mi = dpt.isinf(z1i)
+    check2 = dpt.all(mr == dpt.isinf(z2r)) and dpt.all(mi == dpt.isinf(z2i))
+    if not check2:
+        return check2
+    check3 = dpt.all(z1r[mr] == z2r[mr]) and dpt.all(z1i[mi] == z2i[mi])
+    if not check3:
+        return check3
+    mr = dpt.isfinite(z1r)
+    mi = dpt.isfinite(z1i)
+    mv1 = z1r[mr]
+    mv2 = z2r[mr]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    if not check4:
+        return check4
+    mv1 = z1i[mi]
+    mv2 = z2i[mi]
+    check5 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    return check5
+
+
+def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
+    if equal_nan:
+        check1 = dpt.all(dpt.isnan(r1) == dpt.isnan(r2))
+    else:
+        check1 = dpt.logical_not(dpt.any(dpt.isnan(r1))) and dpt.logical_not(
+            dpt.any(dpt.isnan(r2))
+        )
+    if not check1:
+        return check1
+    mr = dpt.isinf(r1)
+    check2 = dpt.all(mr == dpt.isinf(r2))
+    if not check2:
+        return check2
+    check3 = dpt.all(r1[mr] == r2[mr])
+    if not check3:
+        return check3
+    m = dpt.isfinite(r1)
+    mv1 = r1[m]
+    mv2 = r2[m]
+    check4 = dpt.all(
+        dpt.abs(mv1 - mv2)
+        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
+    )
+    return check4
+
+
+def _allclose_others(r1, r2):
+    return dpt.all(r1 == r2)
+
+
+def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
+    """allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False)
+
+    Returns True if two arrays are element-wise equal within tolerances.
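+
+    A minimal sketch (illustrative values; assumes a default-selected
+    device):
+
+        a = dpt.asarray([1.0, 2.0])
+        b = dpt.asarray([1.0, 2.0 + 5e-6])
+        allclose(a, b)   # 0-d boolean array holding True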
+ + The testing is based on the following elementwise comparison: + + abs(a - b) <= max(atol, rtol * max(abs(a), abs(b))) + """ + if not isinstance(a1, dpt.usm_ndarray): + raise TypeError( + f"Expected dpnp.tensor.usm_ndarray type, got {type(a1)}." + ) + if not isinstance(a2, dpt.usm_ndarray): + raise TypeError( + f"Expected dpnp.tensor.usm_ndarray type, got {type(a2)}." + ) + atol = float(atol) + rtol = float(rtol) + if atol < 0.0 or rtol < 0.0: + raise ValueError( + "Absolute and relative tolerances must be non-negative" + ) + equal_nan = bool(equal_nan) + exec_q = dpt.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2))) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_sh = _broadcast_shape_impl([a1.shape, a2.shape]) + b1 = a1 + b2 = a2 + if b1.dtype == b2.dtype: + res_dt = b1.dtype + else: + res_dt = np.promote_types(b1.dtype, b2.dtype) + res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device) + b1 = dpt.astype(b1, res_dt) + b2 = dpt.astype(b2, res_dt) + + b1 = dpt.broadcast_to(b1, res_sh) + b2 = dpt.broadcast_to(b2, res_sh) + + k = b1.dtype.kind + if k == "c": + return _allclose_complex_fp(b1, b2, atol, rtol, equal_nan) + elif k == "f": + return _allclose_real_fp(b1, b2, atol, rtol, equal_nan) + else: + return _allclose_others(b1, b2) diff --git a/dpnp/tensor/_type_utils.py b/dpnp/tensor/_type_utils.py new file mode 100644 index 000000000000..b03ca1e1c79d --- /dev/null +++ b/dpnp/tensor/_type_utils.py @@ -0,0 +1,1004 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from __future__ import annotations + +import numpy as np + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti + + +def _all_data_types(_fp16, _fp64): + _non_fp_types = [ + dpt.bool, + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ] + if _fp64: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.complex64, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.complex64, + ] + + +def _acceptance_fn_default_binary( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + return True + + +def _acceptance_fn_default_unary(arg_dtype, ret_buf_dt, res_dt, sycl_dev): + return True + + +def _acceptance_fn_divide( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + # both are being promoted, if the kind of result is + # different than the kind of original input dtypes, + # we use default dtype for the resulting kind. + # This covers, e.g. (array_dtype_i1 / array_dtype_u1) + # result of which in divide is double (in NumPy), but + # regular type promotion rules peg at float16 + if (ret_buf1_dt.kind != arg1_dtype.kind) and ( + ret_buf2_dt.kind != arg2_dtype.kind + ): + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_negative(arg_dtype, buf_dt, res_dt, sycl_dev): + # negative is not defined for boolean data type + if arg_dtype.char == "?": + raise ValueError( + "The `negative` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `~` operator or the " + "`logical_not` function instead" + ) + else: + return True + + +def _acceptance_fn_reciprocal(arg_dtype, buf_dt, res_dt, sycl_dev): + # if the kind of result is different from the kind of input, we use the + # default floating-point dtype for the resulting kind. This guarantees + # alignment of reciprocal and divide output types. + if buf_dt.kind != arg_dtype.kind: + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_round(arg_dtype, buf_dt, res_dt, sycl_dev): + # for boolean input, prefer floating-point output over integral + if arg_dtype.kind == "b" and res_dt.kind != "f": + return False + return True + + +def _acceptance_fn_subtract( + arg1_dtype, arg2_dtype, buf1_dt, buf2_dt, res_dt, sycl_dev +): + # subtract is not defined for boolean data type + if arg1_dtype.char == "?" and arg2_dtype.char == "?": + raise ValueError( + "The `subtract` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `^` operator, the " + "`bitwise_xor`, or the `logical_xor` function instead" + ) + else: + return True + + +def _can_cast( + from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool, casting="safe" +) -> bool: + """ + Can `from_` be cast to `to_` safely on a device with + fp16 and fp64 aspects as given? 
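+
+    For example, assuming a device with neither fp16 nor fp64,
+    `_can_cast(dpt.int64, dpt.float32, False, False)` gives True even
+    though NumPy's `can_cast` rejects it, since `float32` is the maximal
+    inexact type available on such a device.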
+ """ + if not _dtype_supported_by_device_impl(to_, _fp16, _fp64): + return False + can_cast_v = np.can_cast(from_, to_, casting=casting) # ask NumPy + if _fp16 and _fp64: + return can_cast_v + if not can_cast_v: + if ( + from_.kind in "biu" + and to_.kind in "fc" + and _is_maximal_inexact_type(to_, _fp16, _fp64) + ): + return True + + return can_cast_v + + +def _dtype_supported_by_device_impl( + dt: dpt.dtype, has_fp16: bool, has_fp64: bool +) -> bool: + if has_fp64: + if not has_fp16: + if dt is dpt.float16: + return False + else: + if dt is dpt.float64: + return False + elif dt is dpt.complex128: + return False + if not has_fp16 and dt is dpt.float16: + return False + return True + + +def _find_buf_dtype(arg_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf_dt in all_dts: + if _can_cast(arg_dtype, buf_dt, _fp16, _fp64): + res_dt = query_fn(buf_dt) + if res_dt: + acceptable = acceptance_fn(arg_dtype, buf_dt, res_dt, sycl_dev) + if acceptable: + return buf_dt, res_dt + else: + continue + + return None, None + + +def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf1_dt in all_dts: + for buf2_dt in all_dts: + if _can_cast(arg1_dtype, buf1_dt, _fp16, _fp64) and _can_cast( + arg2_dtype, buf2_dt, _fp16, _fp64 + ): + res_dt = query_fn(buf1_dt, buf2_dt) + if res_dt: + ret_buf1_dt = None if buf1_dt == arg1_dtype else buf1_dt + ret_buf2_dt = None if buf2_dt == arg2_dtype else buf2_dt + if ret_buf1_dt is None or ret_buf2_dt is None: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + acceptable = acceptance_fn( + arg1_dtype, + arg2_dtype, + ret_buf1_dt, + ret_buf2_dt, + res_dt, + sycl_dev, + ) + if acceptable: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + continue + + return None, None, None + + +def _find_buf_dtype_in_place_op(arg1_dtype, arg2_dtype, query_fn, sycl_dev): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg2_dtype, arg1_dtype, _fp16, _fp64, casting="same_kind"): + res_dt = query_fn(arg1_dtype, arg1_dtype) + if res_dt: + return arg1_dtype, res_dt + + return None, None + + +def _get_device_default_dtype(dt_kind, sycl_dev): + if dt_kind == "b": + return dpt.dtype(ti.default_device_bool_type(sycl_dev)) + elif dt_kind == "i": + return dpt.dtype(ti.default_device_int_type(sycl_dev)) + elif dt_kind == "u": + return dpt.dtype(ti.default_device_uint_type(sycl_dev)) + elif dt_kind == "f": + return dpt.dtype(ti.default_device_fp_type(sycl_dev)) + elif dt_kind == "c": + return dpt.dtype(ti.default_device_complex_type(sycl_dev)) + raise RuntimeError + + +def _is_maximal_inexact_type(dt: dpt.dtype, _fp16: bool, _fp64: bool): + """ + Return True if data type `dt` is the + maximal size inexact data type + """ + if _fp64: + return dt in [dpt.float64, dpt.complex128] + return dt in [dpt.float32, dpt.complex64] + + +def _to_device_supported_dtype(dt, dev): + has_fp16 = dev.has_aspect_fp16 + has_fp64 = dev.has_aspect_fp64 + + return _to_device_supported_dtype_impl(dt, has_fp16, has_fp64) + + +def _to_device_supported_dtype_impl(dt, has_fp16, has_fp64): + if has_fp64: + 
if not has_fp16: + if dt is dpt.float16: + return dpt.float32 + else: + if dt is dpt.float64: + return dpt.float32 + elif dt is dpt.complex128: + return dpt.complex64 + if not has_fp16 and dt is dpt.float16: + return dpt.float32 + return dt + + +class WeakBooleanType: + """Python type representing type of Python boolean objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakIntegralType: + """Python type representing type of Python integral objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakFloatingType: + """Python type representing type of Python floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakComplexType: + """Python type representing type of Python complex floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +def _weak_type_num_kind(o): + _map = {"?": 0, "i": 1, "f": 2, "c": 3} + if isinstance(o, WeakBooleanType): + return _map["?"] + if isinstance(o, WeakIntegralType): + return _map["i"] + if isinstance(o, WeakFloatingType): + return _map["f"] + if isinstance(o, WeakComplexType): + return _map["c"] + raise TypeError( + f"Unexpected type {o} while expecting " + "`WeakBooleanType`, `WeakIntegralType`," + "`WeakFloatingType`, or `WeakComplexType`." + ) + + +def _strong_dtype_num_kind(o): + _map = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3} + if not isinstance(o, dpt.dtype): + raise TypeError + k = o.kind + if k in _map: + return _map[k] + raise ValueError(f"Unrecognized kind {k} for dtype {o}") + + +def _is_weak_dtype(dtype): + return isinstance( + dtype, + (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType), + ) + + +def _resolve_weak_types(o1_dtype, o2_dtype, dev): + """Resolves weak data type per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): + """ + Resolves weak data type per NEP-0050 for comparisons and + divide, where result type is known and special behavior + is needed to handle mixed integer kinds and Python integers + without overflow + """ + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = 
_strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + if o1_kind_num == o2_kind_num and isinstance( + o1_dtype, WeakIntegralType + ): + o1_val = o1_dtype.get() + o2_iinfo = dpt.iinfo(o2_dtype) + if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max): + return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + if o1_kind_num == o2_kind_num and isinstance( + o2_dtype, WeakIntegralType + ): + o2_val = o2_dtype.get() + o1_iinfo = dpt.iinfo(o1_dtype) + if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max): + return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val)) + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev): + """ + Resolves weak data types per NEP-0050, + where the second and third arguments are + permitted to be weak types + """ + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype1): + if _is_weak_dtype(dtype2): + kind_num1 = _weak_type_num_kind(dtype1) + kind_num2 = _weak_type_num_kind(dtype2) + st_kind_num = _strong_dtype_num_kind(st_dtype) + + if kind_num1 > st_kind_num: + if isinstance(dtype1, WeakIntegralType): + ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype1, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype1 = dpt.complex64 + else: + ret_dtype1 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev) + else: + ret_dtype1 = st_dtype + + if kind_num2 > st_kind_num: + if isinstance(dtype2, WeakIntegralType): + ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype2, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype2 = dpt.complex64 + else: + ret_dtype2 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev) + else: + ret_dtype2 = st_dtype + + return ret_dtype1, ret_dtype2 + + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype2), dtype2), + ] + ) + dt1_kind_num = _weak_type_num_kind(dtype1) + if dt1_kind_num > max_dt_num_kind: + if isinstance(dtype1, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), dtype2 + if isinstance(dtype1, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dpt.complex64, dtype2 + return ( + _to_device_supported_dtype(dpt.complex128, dev), + dtype2, + ) + return _to_device_supported_dtype(dpt.float64, dev), dtype2 + else: + return
max_dtype, dtype2 + elif _is_weak_dtype(dtype2): + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype1), dtype1), + ] + ) + dt2_kind_num = _weak_type_num_kind(dtype2) + if dt2_kind_num > max_dt_num_kind: + if isinstance(dtype2, WeakIntegralType): + return dtype1, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype2, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dtype1, dpt.complex64 + return ( + dtype1, + _to_device_supported_dtype(dpt.complex128, dev), + ) + return dtype1, _to_device_supported_dtype(dpt.float64, dev) + else: + return dtype1, max_dtype + else: + # both are strong dtypes + # return unmodified + return dtype1, dtype2 + + +def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev): + """Resolves one weak data type with one strong data type per NEP-0050""" + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype): + st_kind_num = _strong_dtype_num_kind(st_dtype) + kind_num = _weak_type_num_kind(dtype) + if kind_num > st_kind_num: + if isinstance(dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + return dpt.complex64 + return _to_device_supported_dtype(dpt.complex128, dev) + return _to_device_supported_dtype(dpt.float64, dev) + else: + return st_dtype + else: + return dtype + + +class finfo_object: + """ + `numpy.finfo` subclass which returns Python floating-point scalars for + `eps`, `max`, `min`, and `smallest_normal` attributes. + """ + + def __init__(self, dtype): + _supported_dtype([dpt.dtype(dtype)]) + self._finfo = np.finfo(dtype) + + @property + def bits(self): + """Number of bits occupied by the real-valued floating-point data type.""" + return int(self._finfo.bits) + + @property + def smallest_normal(self): + """ + Smallest positive real-valued floating-point number with full + precision. + """ + return float(self._finfo.smallest_normal) + + @property + def tiny(self): + """An alias for `smallest_normal`""" + return float(self._finfo.tiny) + + @property + def eps(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number larger than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.eps) + + @property + def epsneg(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number smaller than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.epsneg) + + @property + def min(self): + """Smallest representable real-valued number.""" + return float(self._finfo.min) + + @property + def max(self): + """Largest representable real-valued number.""" + return float(self._finfo.max) + + @property + def resolution(self): + """The approximate decimal resolution of this type.""" + return float(self._finfo.resolution) + + @property + def precision(self): + """ + The approximate number of decimal digits to which this kind of + floating point type is precise. + """ + return float(self._finfo.precision) + + @property + def dtype(self): + """ + The dtype for which finfo returns information. For complex input, the + returned dtype is the associated floating point dtype for its real and + complex components. 
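+ + :Example: + + .. code-block:: python + + # assuming finfo is re-exported as dpnp.tensor.finfo + import dpnp.tensor as dpt + + # for a complex dtype, the component real dtype is reported + dpt.finfo(dpt.complex64).dtype # dtype('float32')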
+ """ + return self._finfo.dtype + + def __str__(self): + return self._finfo.__str__() + + def __repr__(self): + return self._finfo.__repr__() + + +def can_cast(from_, to, /, *, casting="safe") -> bool: + """can_cast(from, to, casting="safe") + + Determines if one data type can be cast to another data type according \ + to Type Promotion Rules. + + Args: + from_ (Union[usm_ndarray, dtype]): + source data type. If `from_` is an array, a device-specific type + promotion rules apply. + to (dtype): + target data type + casting (Optional[str]): + controls what kind of data casting may occur. + + * "no" means data types should not be cast at all. + * "safe" means only casts that preserve values are allowed. + * "same_kind" means only safe casts and casts within a kind, + like `float64` to `float32`, are allowed. + * "unsafe" means any data conversion can be done. + + Default: `"safe"`. + + Returns: + bool: + Gives `True` if cast can occur according to the casting rule. + + Device-specific type promotion rules take into account which data type are + and are not supported by a specific device. + """ + if isinstance(to, dpt.usm_ndarray): + raise TypeError(f"Expected `dpt.dtype` type, got {type(to)}.") + + dtype_to = dpt.dtype(to) + _supported_dtype([dtype_to]) + + if isinstance(from_, dpt.usm_ndarray): + dtype_from = from_.dtype + return _can_cast( + dtype_from, + dtype_to, + from_.sycl_device.has_aspect_fp16, + from_.sycl_device.has_aspect_fp64, + casting=casting, + ) + else: + dtype_from = dpt.dtype(from_) + _supported_dtype([dtype_from]) + # query casting as if all dtypes are supported + return _can_cast(dtype_from, dtype_to, True, True, casting=casting) + + +def result_type(*arrays_and_dtypes): + """ + result_type(*arrays_and_dtypes) + + Returns the dtype that results from applying the Type Promotion Rules to \ + the arguments. + + Args: + arrays_and_dtypes (Union[usm_ndarray, dtype]): + An arbitrary length sequence of usm_ndarray objects or dtypes. + + Returns: + dtype: + The dtype resulting from an operation involving the + input arrays and dtypes. + """ + dtypes = [] + devices = [] + weak_dtypes = [] + for arg_i in arrays_and_dtypes: + if isinstance(arg_i, dpt.usm_ndarray): + devices.append(arg_i.sycl_device) + dtypes.append(arg_i.dtype) + elif isinstance(arg_i, int): + weak_dtypes.append(WeakIntegralType(arg_i)) + elif isinstance(arg_i, float): + weak_dtypes.append(WeakFloatingType(arg_i)) + elif isinstance(arg_i, complex): + weak_dtypes.append(WeakComplexType(arg_i)) + elif isinstance(arg_i, bool): + weak_dtypes.append(WeakBooleanType(arg_i)) + else: + dt = dpt.dtype(arg_i) + _supported_dtype([dt]) + dtypes.append(dt) + + has_fp16 = True + has_fp64 = True + target_dev = None + if devices: + inspected = False + for d in devices: + if inspected: + unsame_fp16_support = d.has_aspect_fp16 != has_fp16 + unsame_fp64_support = d.has_aspect_fp64 != has_fp64 + if unsame_fp16_support or unsame_fp64_support: + raise ValueError( + "Input arrays reside on devices " + "with different device supports; " + "unable to determine which " + "device-specific type promotion rules " + "to use." 
+ ) + else: + has_fp16 = d.has_aspect_fp16 + has_fp64 = d.has_aspect_fp64 + target_dev = d + inspected = True + + if not dtypes and weak_dtypes: + dtypes.append(weak_dtypes[0].get()) + + if not (has_fp16 and has_fp64): + for dt in dtypes: + if not _dtype_supported_by_device_impl(dt, has_fp16, has_fp64): + raise ValueError( + f"Argument {dt} is not supported by the device" + ) + res_dt = np.result_type(*dtypes) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + for wdt in weak_dtypes: + pair = _resolve_weak_types(wdt, res_dt, target_dev) + res_dt = np.result_type(*pair) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + else: + res_dt = np.result_type(*dtypes) + if weak_dtypes: + weak_dt_obj = [wdt.get() for wdt in weak_dtypes] + res_dt = np.result_type(res_dt, *weak_dt_obj) + + return res_dt + + +def iinfo(dtype, /): + """iinfo(dtype) + + Returns machine limits for integer data types. + + Args: + dtype (dtype, usm_ndarray): + integer dtype or + an array with integer dtype. + + Returns: + iinfo_object: + An object with the following attributes: + + * bits: int + number of bits occupied by the data type + * max: int + largest representable number. + * min: int + smallest representable number. + * dtype: dtype + integer data type. + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return np.iinfo(dtype) + + +def finfo(dtype, /): + """finfo(type) + + Returns machine limits for floating-point data types. + + Args: + dtype (dtype, usm_ndarray): floating-point dtype or + an array with floating point data type. + If complex, the information is about its component + data type. + + Returns: + finfo_object: + an object have the following attributes: + + * bits: int + number of bits occupied by dtype. + * eps: float + difference between 1.0 and the next smallest representable + real-valued floating-point number larger than 1.0 according + to the IEEE-754 standard. + * max: float + largest representable real-valued number. + * min: float + smallest representable real-valued number. + * smallest_normal: float + smallest positive real-valued floating-point number with + full precision. + * dtype: dtype + real-valued floating-point data type. + + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return finfo_object(dtype) + + +def _supported_dtype(dtypes): + for dtype in dtypes: + if dtype.char not in "?bBhHiIlLqQefdFD": + raise ValueError(f"Dpctl doesn't support dtype {dtype}.") + return True + + +def isdtype(dtype, kind): + """isdtype(dtype, kind) + + Returns a boolean indicating whether a provided `dtype` is + of a specified data type `kind`. + + See [array API](array_api) for more information. 
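+ + :Example: + + .. code-block:: python + + # assuming isdtype is re-exported as dpnp.tensor.isdtype + import dpnp.tensor as dpt + + dpt.isdtype(dpt.dtype("float32"), "real floating") # True + dpt.isdtype(dpt.dtype("int32"), ("integral", "complex floating")) # True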
+ + [array_api]: https://data-apis.org/array-api/latest/ + """ + + if not isinstance(dtype, np.dtype): + raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype}") + + if isinstance(kind, np.dtype): + return dtype == kind + + elif isinstance(kind, str): + if kind == "bool": + return dtype == np.dtype("bool") + elif kind == "signed integer": + return dtype.kind == "i" + elif kind == "unsigned integer": + return dtype.kind == "u" + elif kind == "integral": + return dtype.kind in "iu" + elif kind == "real floating": + return dtype.kind == "f" + elif kind == "complex floating": + return dtype.kind == "c" + elif kind == "numeric": + return dtype.kind in "iufc" + else: + raise ValueError(f"Unrecognized data type kind: {kind}") + + elif isinstance(kind, tuple): + return any(isdtype(dtype, k) for k in kind) + + else: + raise TypeError(f"Unsupported data type kind: {kind}") + + +def _default_accumulation_dtype(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + """ + inp_kind = inp_dt.kind + if inp_kind in "bi": + res_dt = dpt.dtype(ti.default_device_int_type(q)) + if inp_dt.itemsize > res_dt.itemsize: + res_dt = inp_dt + elif inp_kind in "u": + res_dt = dpt.dtype(ti.default_device_uint_type(q)) + res_ii = dpt.iinfo(res_dt) + inp_ii = dpt.iinfo(inp_dt) + if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: + pass + else: + res_dt = inp_dt + elif inp_kind in "fc": + res_dt = inp_dt + + return res_dt + + +def _default_accumulation_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + and the accumulation supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = inp_dt + elif inp_kind in "c": + raise ValueError("function not defined for complex types") + + return res_dt + + +__all__ = [ + "_find_buf_dtype", + "_find_buf_dtype2", + "_to_device_supported_dtype", + "_acceptance_fn_default_unary", + "_acceptance_fn_round", + "_acceptance_fn_reciprocal", + "_acceptance_fn_default_binary", + "_acceptance_fn_divide", + "_acceptance_fn_negative", + "_acceptance_fn_subtract", + "_resolve_one_strong_one_weak_types", + "_resolve_one_strong_two_weak_types", + "_resolve_weak_types", + "_resolve_weak_types_all_py_ints", + "_weak_type_num_kind", + "_strong_dtype_num_kind", + "can_cast", + "finfo", + "iinfo", + "isdtype", + "result_type", + "WeakBooleanType", + "WeakIntegralType", + "WeakFloatingType", + "WeakComplexType", + "_default_accumulation_dtype", + "_default_accumulation_dtype_fp_types", + "_find_buf_dtype_in_place_op", +] diff --git a/dpnp/tensor/_types.pxi b/dpnp/tensor/_types.pxi new file mode 100644 index 000000000000..090750658f4b --- /dev/null +++ b/dpnp/tensor/_types.pxi @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# these typenum values are aligned to values in NumPy +cdef: + int UAR_BOOL = 0 # pragma: no cover + int UAR_BYTE = 1 # pragma: no cover + int UAR_UBYTE = 2 # pragma: no cover + int UAR_SHORT = 3 # pragma: no cover + int UAR_USHORT = 4 # pragma: no cover + int UAR_INT = 5 # pragma: no cover + int UAR_UINT = 6 # pragma: no cover + int UAR_LONG = 7 # pragma: no cover + int UAR_ULONG = 8 # pragma: no cover + int UAR_LONGLONG = 9 # pragma: no cover + int UAR_ULONGLONG = 10 # pragma: no cover + int UAR_FLOAT = 11 # pragma: no cover + int UAR_DOUBLE = 12 # pragma: no cover + int UAR_CFLOAT = 14 # pragma: no cover + int UAR_CDOUBLE = 15 # pragma: no cover + int UAR_TYPE_SENTINEL = 17 # pragma: no cover + int UAR_HALF = 23 # pragma: no cover + +cdef int type_bytesize(int typenum): + """ + NPY_BOOL=0 : 1 + NPY_BYTE=1 : 1 + NPY_UBYTE=2 : 1 + NPY_SHORT=3 : 2 + NPY_USHORT=4 : 2 + NPY_INT=5 : sizeof(int) + NPY_UINT=6 : sizeof(unsigned int) + NPY_LONG=7 : sizeof(long) + NPY_ULONG=8 : sizeof(unsigned long) + NPY_LONGLONG=9 : 8 + NPY_ULONGLONG=10 : 8 + NPY_FLOAT=11 : 4 + NPY_DOUBLE=12 : 8 + NPY_LONGDOUBLE=13 : N/A + NPY_CFLOAT=14 : 8 + NPY_CDOUBLE=15 : 16 + NPY_CLONGDOUBLE=16 : N/A + NPY_HALF=23 : 2 + """ + cdef int *type_to_bytesize = [ + 1, + sizeof(char), + sizeof(unsigned char), + sizeof(short), + sizeof(unsigned short), + sizeof(int), + sizeof(unsigned int), + sizeof(long), + sizeof(unsigned long), + sizeof(long long), + sizeof(unsigned long long), + sizeof(float), + sizeof(double), -1, + sizeof(float complex), + sizeof(double complex), -1] + + if typenum < 0: # pragma: no cover + return -1 + if typenum > 16: + if typenum == 23: + return 2 + return -1 + + return type_to_bytesize[typenum] + + +cdef str _make_typestr(int typenum): + """ + Make typestring from type number + """ + cdef type_to_str = ["|b", "|i", "|u", "|i", "|u", + "|i", "|u", "|i", "|u", "|i", "|u", + "|f", "|f", "", "|c", "|c", ""] + + if (typenum < 0): # pragma: no cover + return "" + if (typenum > 16): + if (typenum == 23): + return "|f2" + return "" # pragma: no cover + + return type_to_str[typenum] + str(type_bytesize(typenum)) + + +cdef int typenum_from_format(str s): + """ + Internal utility to convert string describing type format + + Format is [<|=>][biufc]# + 
Shortcuts for formats are i, u, d, D + """ + if not s: + return -1 + try: + dt = np.dtype(s) + except Exception: + return -1 + if (dt.byteorder == ">"): + return -2 + return dt.num + + +cdef int descr_to_typenum(object dtype): + """ + Returns typenum for argumentd dtype that has attribute descr, + assumed numpy.dtype + """ + obj = getattr(dtype, "descr") + if (not isinstance(obj, list) or len(obj) != 1): + return -1 # token for ValueError + obj = obj[0] + if ( + not isinstance(obj, tuple) or len(obj) != 2 or obj[0] + ): # pragma: no cover + return -1 + obj = obj[1] + if not isinstance(obj, str): # pragma: no cover + return -1 + return typenum_from_format(obj) + + +cdef int dtype_to_typenum(dtype): + if isinstance(dtype, str): + return typenum_from_format(dtype) + elif isinstance(dtype, bytes): + return typenum_from_format(dtype.decode("UTF-8")) + elif hasattr(dtype, "descr"): + return descr_to_typenum(dtype) + else: + try: + dt = np.dtype(dtype) + except TypeError: + return -3 + except Exception: # pragma: no cover + return -1 + if hasattr(dt, "descr"): + return descr_to_typenum(dt) + else: # pragma: no cover + return -3 # token for TypeError diff --git a/dpnp/tensor/_usmarray.pxd b/dpnp/tensor/_usmarray.pxd new file mode 100644 index 000000000000..ccb8f4c796b7 --- /dev/null +++ b/dpnp/tensor/_usmarray.pxd @@ -0,0 +1,88 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 + +cimport dpctl + + +cdef public api int USM_ARRAY_C_CONTIGUOUS +cdef public api int USM_ARRAY_F_CONTIGUOUS +cdef public api int USM_ARRAY_WRITABLE + +cdef public api int UAR_BOOL +cdef public api int UAR_BYTE +cdef public api int UAR_UBYTE +cdef public api int UAR_SHORT +cdef public api int UAR_USHORT +cdef public api int UAR_INT +cdef public api int UAR_UINT +cdef public api int UAR_LONG +cdef public api int UAR_ULONG +cdef public api int UAR_LONGLONG +cdef public api int UAR_ULONGLONG +cdef public api int UAR_FLOAT +cdef public api int UAR_DOUBLE +cdef public api int UAR_CFLOAT +cdef public api int UAR_CDOUBLE +cdef public api int UAR_TYPE_SENTINEL +cdef public api int UAR_HALF + + +cdef api class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]: + # data fields + cdef char* data_ + cdef int nd_ + cdef Py_ssize_t *shape_ + cdef Py_ssize_t *strides_ + cdef int typenum_ + cdef int flags_ + cdef object base_ + cdef object array_namespace_ + # make usm_ndarray weak-referenceable + cdef object __weakref__ + + cdef void _reset(usm_ndarray self) + cdef void _cleanup(usm_ndarray self) + cdef Py_ssize_t get_offset(usm_ndarray self) except * + + cdef char* get_data(self) + cdef int get_ndim(self) + cdef Py_ssize_t * get_shape(self) + cdef Py_ssize_t * get_strides(self) + cdef int get_typenum(self) + cdef int get_itemsize(self) + cdef int get_flags(self) + cdef object get_base(self) + cdef dpctl.DPCTLSyclQueueRef get_queue_ref(self) except * + cdef dpctl.SyclQueue get_sycl_queue(self) + + cdef _set_writable_flag(self, int) + + cdef __cythonbufferdefaults__ = {"mode": "strided"} diff --git a/dpnp/tensor/_usmarray.pyx b/dpnp/tensor/_usmarray.pyx new file mode 100644 index 000000000000..c696056d53c2 --- /dev/null +++ b/dpnp/tensor/_usmarray.pyx @@ -0,0 +1,1745 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +import dpctl +import dpctl.memory as dpmem +import numpy as np + +from dpctl._backend cimport DPCTLSyclUSMRef +from dpctl._sycl_device_factory cimport _cached_default_device + +import dpnp + +from ._data_types import bool as dpt_bool +from ._device import Device +from ._print import usm_ndarray_repr, usm_ndarray_str + +cimport dpctl as c_dpctl +cimport dpctl.memory as c_dpmem +from cpython.mem cimport PyMem_Free +from cpython.tuple cimport PyTuple_New, PyTuple_SetItem + +from . cimport _dlpack as c_dlpack + +from enum import IntEnum + +from . import _flags +from ._dlpack import get_build_dlpack_version +from ._tensor_impl import default_device_fp_type + +include "_stride_utils.pxi" +include "_types.pxi" +include "_slicing.pxi" + + +class DLDeviceType(IntEnum): + """ + An :class:`enum.IntEnum` for the types of DLDevices supported by the DLPack + protocol. + + ``kDLCPU``: + CPU (host) device + ``kDLCUDA``: + CUDA GPU device + ``kDLCUDAHost``: + Pinned CUDA CPU memory by cudaMallocHost + ``kDLOpenCL``: + OpenCL device + ``kDLVulkan``: + Vulkan buffer + ``kDLMetal``: + Metal for Apple GPU + ``kDLVPI``: + Verilog simulator buffer + ``kDLROCM``: + ROCm GPU device + ``kDLROCMHost``: + Pinned ROCm CPU memory allocated by hipMallocHost + ``kDLExtDev``: + Reserved extension device type used to test new devices + ``kDLCUDAManaged``: + CUDA managed/unified memory allocated by cudaMallocManaged + ``kDLOneAPI``: + Unified shared memory allocated on a oneAPI non-partitioned device + ``kDLWebGPU``: + Device support for WebGPU standard + ``kDLHexagon``: + Qualcomm Hexagon DSP + ``kDLMAIA``: + Microsoft MAIA device + ``kDLTrn``: + AWS Trainium device + """ + kDLCPU = c_dlpack.device_CPU + kDLCUDA = c_dlpack.device_CUDA + kDLCUDAHost = c_dlpack.device_CUDAHost + kDLCUDAManaged = c_dlpack.device_CUDAManaged + kDLROCM = c_dlpack.device_DLROCM + kDLROCMHost = c_dlpack.device_ROCMHost + kDLOpenCL = c_dlpack.device_OpenCL + kDLVulkan = c_dlpack.device_Vulkan + kDLMetal = c_dlpack.device_Metal + kDLVPI = c_dlpack.device_VPI + kDLOneAPI = c_dlpack.device_OneAPI + kDLWebGPU = c_dlpack.device_WebGPU + kDLHexagon = c_dlpack.device_Hexagon + kDLMAIA = c_dlpack.device_MAIA + kDLTrn = c_dlpack.device_Trn + + +cdef class InternalUSMArrayError(Exception): + """ + An InternalUSMArrayError exception is raised when internal + inconsistency has been detected in :class:`.usm_ndarray`. 
+ """ + pass + + +cdef object _as_zero_dim_ndarray(object usm_ary): + "Convert size-1 array to NumPy 0d array" + mem_view = dpmem.as_usm_memory(usm_ary) + usm_ary.sycl_queue.wait() + host_buf = mem_view.copy_to_host() + view = host_buf.view(usm_ary.dtype) + view.shape = tuple() + return view + + +cdef inline void _check_0d_scalar_conversion(object usm_ary) except *: + "Raise TypeError if array cannot be converted to a Python scalar" + if (usm_ary.ndim != 0): + raise TypeError( + "only 0-dimensional arrays can be converted to Python scalars" + ) + + +cdef int _copy_writable(int lhs_flags, int rhs_flags): + "Copy the WRITABLE flag to lhs_flags from rhs_flags" + return (lhs_flags & ~USM_ARRAY_WRITABLE) | (rhs_flags & USM_ARRAY_WRITABLE) + + +cdef bint _is_host_cpu(object dl_device): + "Check if dl_device denotes (kDLCPU, 0)" + cdef object dl_type + cdef object dl_id + cdef Py_ssize_t n_elems = -1 + + try: + n_elems = len(dl_device) + except TypeError: + pass + + if n_elems != 2: + return False + + dl_type = dl_device[0] + dl_id = dl_device[1] + if isinstance(dl_type, str): + return (dl_type == "kDLCPU" and dl_id == 0) + + return (dl_type == DLDeviceType.kDLCPU) and (dl_id == 0) + + +cdef void _validate_and_use_stream( + object stream, c_dpctl.SyclQueue self_queue +) except *: + if (stream is None or stream == self_queue): + pass + else: + if not isinstance(stream, dpctl.SyclQueue): + raise TypeError( + "stream argument type was expected to be dpctl.SyclQueue," + f" got {type(stream)} instead" + ) + ev = self_queue.submit_barrier() + stream.submit_barrier(dependent_events=[ev]) + +cdef class usm_ndarray: + """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \ + offset=0, order="C", buffer_ctor_kwargs=dict(), \ + array_namespace=None) + + An array object represents a multidimensional tensor of numeric + elements stored in a USM allocation on a SYCL device. + + Arg: + shape (int, tuple): + Shape of the array to be created. + dtype (str, dtype): + Array data type, i.e. the type of array elements. + If ``dtype`` has the value ``None``, it is determined by default + floating point type supported by target device. + The supported types are + + ``bool``: + boolean type + ``int8``, ``int16``, ``int32``, ``int64``: + signed integer types + ``uint8``, ``uint16``, ``uint32``, ``uint64``: + unsigned integer types + ``float16``: + half-precision floating type, + supported if target device's property + ``has_aspect_fp16`` is ``True`` + ``float32``, ``complex64``: + single-precision real and complex floating types + ``float64``, ``complex128``: + double-precision real and complex floating + types, supported if target device's property + ``has_aspect_fp64`` is ``True``. + + Default: ``None``. + strides (tuple, optional): + Strides of the array to be created in elements. + If ``strides`` has the value ``None``, it is determined by the + ``shape`` of the array and the requested ``order``. + Default: ``None``. + buffer (str, object, optional): + A string corresponding to the type of USM allocation to make, + or a Python object representing a USM memory allocation, i.e. + :class:`dpctl.memory.MemoryUSMDevice`, + :class:`dpctl.memory.MemoryUSMShared`, or + :class:`dpctl.memory.MemoryUSMHost`. Recognized strings are + ``"device"``, ``"shared"``, or ``"host"``. Additional arguments to + the USM memory allocators can be passed in a dictionary specified + via ``buffer_ctor_kwrds`` keyword parameter. + Default: ``"device"``. 
+ offset (int, optional): + Offset of the array element with all zero indexes relative to the + start of the provided `buffer` in elements. The argument is ignored + if the ``buffer`` value is a string and the memory is allocated by + the constructor. Default: ``0``. + order ({"C", "F"}, optional): + The memory layout of the array when constructing using a new + allocation. Value ``"C"`` corresponds to C-contiguous, or row-major + memory layout, while value ``"F"`` corresponds to F-contiguous, or + column-major layout. Default: ``"C"``. + buffer_ctor_kwargs (dict, optional): + Dictionary with keyword parameters to use when creating a new USM + memory allocation. See :class:`dpctl.memory.MemoryUSMShared` for + supported keyword arguments. + array_namespace (module, optional): + Array namespace module associated with this array. + Default: ``None``. + + ``buffer`` can be ``"shared"``, ``"host"``, ``"device"`` to allocate + new device memory by calling respective constructor with + the specified ``buffer_ctor_kwargs``; ``buffer`` can be an + instance of :class:`dpctl.memory.MemoryUSMShared`, + :class:`dpctl.memory.MemoryUSMDevice`, or + :class:`dpctl.memory.MemoryUSMHost`; ``buffer`` can also be + another :class:`dpctl.tensor.usm_ndarray` instance, in which case its + underlying ``MemoryUSM*`` buffer is used. + """ + + cdef void _reset(usm_ndarray self): + """ + Initializes member fields + """ + self.base_ = None + self.array_namespace_ = None + self.nd_ = -1 + self.data_ = 0 + self.shape_ = 0 + self.strides_ = 0 + self.flags_ = 0 + + cdef void _cleanup(usm_ndarray self): + if (self.shape_): + PyMem_Free(self.shape_) + if (self.strides_): + PyMem_Free(self.strides_) + self._reset() + + def __cinit__(self, shape, dtype=None, strides=None, buffer="device", + Py_ssize_t offset=0, order="C", + buffer_ctor_kwargs=dict(), + array_namespace=None): + """ + strides and offset must be given in units of array elements. + buffer can be strings ('device'|'shared'|'host') to allocate new memory, + or ``dpctl.memory.MemoryUSM*`` buffers, or ``usm_ndarray`` instances. + """ + cdef int nd = 0 + cdef int typenum = 0 + cdef int itemsize = 0 + cdef int err = 0 + cdef int contig_flag = 0 + cdef int writable_flag = USM_ARRAY_WRITABLE + cdef Py_ssize_t *shape_ptr = NULL + cdef Py_ssize_t ary_nelems = 0 + cdef Py_ssize_t ary_nbytes = 0 + cdef Py_ssize_t *strides_ptr = NULL + cdef Py_ssize_t _offset = offset + cdef Py_ssize_t ary_min_displacement = 0 + cdef Py_ssize_t ary_max_displacement = 0 + cdef bint is_fp64 = False + cdef bint is_fp16 = False + + self._reset() + if not isinstance(shape, (list, tuple)): + if hasattr(shape, "tolist"): + fn = getattr(shape, "tolist") + if callable(fn): + shape = shape.tolist() + if not isinstance(shape, (list, tuple)): + try: + <Py_ssize_t> shape + shape = [shape, ] + except Exception as e: + raise TypeError( + "Argument shape must be a non-negative integer, " + "or a list/tuple of such integers." + ) from e + nd = len(shape) + if dtype is None: + if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)): + q = buffer.sycl_queue + else: + q = buffer_ctor_kwargs.get("queue") + if q is not None: + dtype = default_device_fp_type(q) + else: + dev = _cached_default_device() + dtype = "f8" if dev.has_aspect_fp64 else "f4" + typenum = dtype_to_typenum(dtype) + if (typenum < 0): + if typenum == -2: + raise ValueError( + "Data type '" + str(dtype) + + "' can only have native byteorder." + ) + elif typenum == -1: + raise ValueError( + "Data type '" + str(dtype) + "' is not understood."
+ ) + raise TypeError( + f"Expected string or a dtype object, got {type(dtype)}" + ) + itemsize = type_bytesize(typenum) + if (itemsize < 1): + raise TypeError( + "dtype=" + np.dtype(dtype).name + " is not supported." + ) + # allocate host C-arrays for shape, strides + err = _from_input_shape_strides( + nd, shape, strides, itemsize, ord(order), + &shape_ptr, &strides_ptr, &ary_nelems, + &ary_min_displacement, &ary_max_displacement, &contig_flag + ) + if (err): + self._cleanup() + if err == ERROR_MALLOC: + raise MemoryError("Memory allocation for shape/strides " + "array failed.") + elif err == ERROR_INCORRECT_ORDER: + raise ValueError( + "Unsupported order='{}' given. " + "Supported values are 'C' or 'F'.".format(order)) + elif err == ERROR_UNEXPECTED_STRIDES: + raise ValueError( + "strides={} is not understood".format(strides)) + else: + raise InternalUSMArrayError( + " .. while processing shape and strides.") + ary_nbytes = (ary_max_displacement - + ary_min_displacement + 1) * itemsize + if isinstance(buffer, dpmem._memory._Memory): + _buffer = buffer + elif isinstance(buffer, (str, bytes)): + if isinstance(buffer, bytes): + buffer = buffer.decode("UTF-8") + _offset = -ary_min_displacement + if (buffer == "shared"): + _buffer = dpmem.MemoryUSMShared(ary_nbytes, + **buffer_ctor_kwargs) + elif (buffer == "device"): + _buffer = dpmem.MemoryUSMDevice(ary_nbytes, + **buffer_ctor_kwargs) + elif (buffer == "host"): + _buffer = dpmem.MemoryUSMHost(ary_nbytes, + **buffer_ctor_kwargs) + else: + self._cleanup() + raise ValueError( + "buffer='{}' is not understood. " + "Recognized values are 'device', 'shared', 'host', " + "an instance of `MemoryUSM*` object, or a usm_ndarray" + "".format(buffer) + ) + elif isinstance(buffer, usm_ndarray): + if not buffer.flags.writable: + writable_flag = 0 + _buffer = buffer.usm_data + else: + self._cleanup() + raise ValueError("buffer='{}' was not understood.".format(buffer)) + if (shape_to_elem_count(nd, shape_ptr) > 0 and + (_offset + ary_min_displacement < 0 or + (_offset + ary_max_displacement + 1) * itemsize > _buffer.nbytes)): + self._cleanup() + raise ValueError(("buffer='{}' can not accommodate " + "the requested array.").format(buffer)) + is_fp64 = (typenum == UAR_DOUBLE or typenum == UAR_CDOUBLE) + is_fp16 = (typenum == UAR_HALF) + if (is_fp64 or is_fp16): + if ( + (is_fp64 and not _buffer.sycl_device.has_aspect_fp64) or + (is_fp16 and not _buffer.sycl_device.has_aspect_fp16) + ): + raise ValueError( + f"Device {_buffer.sycl_device.name} does" + f" not support {dtype} natively." + ) + self.base_ = _buffer + self.data_ = (<char *> (<size_t> _buffer._pointer)) + itemsize * _offset + self.shape_ = shape_ptr + self.strides_ = strides_ptr + self.typenum_ = typenum + self.flags_ = (contig_flag | writable_flag) + self.nd_ = nd + self.array_namespace_ = array_namespace + + def __dealloc__(self): + self._cleanup() + + @property + def _pointer(self): + """ + Returns USM pointer to the start of array (element with zero + multi-index) encoded as integer.
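+ + :Example: + + .. code-block:: python + + from dpnp import tensor + + x = tensor.ones(10) + # a zero-copy view starts at the same USM address + assert x._pointer == x[:]._pointer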
+ """ + return self.get_data() + + cdef Py_ssize_t get_offset(self) except *: + cdef char *mem_ptr = NULL + cdef char *ary_ptr = self.get_data() + mem_ptr = ( self.base_._pointer) + byte_offset = ary_ptr - mem_ptr + item_size = self.get_itemsize() + if (byte_offset % item_size): + raise InternalUSMArrayError( + "byte_offset is not a multiple of item_size.") + return byte_offset // item_size + + @property + def _element_offset(self): + """Returns the offset of the zero-index element of the array, in + elements, relative to the start of memory allocation""" + return self.get_offset() + + @property + def _byte_bounds(self): + """Returns a 2-tuple with pointers to the end-points of the array + + :Example: + + .. code-block:: python + + from dpnp import tensor + + x = tensor.ones((3, 10, 7)) + y = tensor.flip(x[:, 1::2], axis=1) + + beg_p, end_p = y._byte_bounds + # Bytes taken to store this array + bytes_extent = end_p - beg_p + + # C-contiguous copy is more compact + yc = tensor.copy(y, order="C") + beg_pc, end_pc = yc._byte_bounds + assert bytes_extent < end_pc - beg_pc + """ + cdef Py_ssize_t min_disp = 0 + cdef Py_ssize_t max_disp = 0 + cdef Py_ssize_t step_ = 0 + cdef Py_ssize_t dim_ = 0 + cdef int it = 0 + cdef Py_ssize_t _itemsize = self.get_itemsize() + + if ( + (self.flags_ & USM_ARRAY_C_CONTIGUOUS) + or (self.flags_ & USM_ARRAY_F_CONTIGUOUS) + ): + return ( + self._pointer, + self._pointer + shape_to_elem_count( + self.nd_, self.shape_ + ) * _itemsize + ) + + for it in range(self.nd_): + dim_ = self.shape[it] + if dim_ > 0: + step_ = self.strides[it] + if step_ > 0: + max_disp += step_ * (dim_ - 1) + else: + min_disp += step_ * (dim_ - 1) + + return ( + self._pointer + min_disp * _itemsize, + self._pointer + (max_disp + 1) * _itemsize + ) + + cdef char* get_data(self): + """Returns the USM pointer for this array.""" + return self.data_ + + cdef int get_ndim(self): + """ + Returns the number of indices needed to address + an element of this array. + """ + return self.nd_ + + cdef Py_ssize_t* get_shape(self): + """ + Returns pointer to shape C-array for this array. + + C-array has at least ``ndim`` non-negative elements, + which determine the range of permissible indices + addressing individual elements of this array. + """ + return self.shape_ + + cdef Py_ssize_t* get_strides(self): + """ + Returns pointer to strides C-array for this array. 
+ + The pointer can be NULL (for a contiguous array); otherwise the + C-array has at least ``ndim`` elements + """ + return self.strides_ + + cdef int get_typenum(self): + """Returns typenum corresponding to values of this array""" + return self.typenum_ + + cdef int get_itemsize(self): + """ + Returns itemsize of this array in bytes + """ + return type_bytesize(self.typenum_) + + cdef int get_flags(self): + """Returns flags of this array""" + return self.flags_ + + cdef object get_base(self): + """Returns the object owning the USM data addressed by this array""" + return self.base_ + + cdef c_dpctl.SyclQueue get_sycl_queue(self): + cdef c_dpmem._Memory mem + if not isinstance(self.base_, dpctl.memory._Memory): + raise InternalUSMArrayError( + "This array has unexpected memory owner" + ) + mem = <c_dpmem._Memory> self.base_ + return mem.queue + + cdef c_dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *: + """ + Returns a copy of DPCTLSyclQueueRef associated with array + """ + cdef c_dpctl.SyclQueue q = self.get_sycl_queue() + cdef c_dpctl.DPCTLSyclQueueRef QRef = q.get_queue_ref() + cdef c_dpctl.DPCTLSyclQueueRef QRefCopy = NULL + if QRef is not NULL: + QRefCopy = c_dpctl.DPCTLQueue_Copy(QRef) + return QRefCopy + else: + raise InternalUSMArrayError( + "Memory owner of this array is corrupted" + ) + + @property + def __sycl_usm_array_interface__(self): + """ + Gives ``__sycl_usm_array_interface__`` dictionary describing + the array. + """ + cdef Py_ssize_t byte_offset = -1 + cdef int item_size = -1 + cdef Py_ssize_t elem_offset = -1 + cdef char *mem_ptr = NULL + cdef char *ary_ptr = NULL + if (not isinstance(self.base_, dpmem._memory._Memory)): + raise InternalUSMArrayError( + "Invalid instance of usm_ndarray encountered. " + "Private field base_ has an unexpected type {}.".format( + type(self.base_) + ) + ) + ary_iface = self.base_.__sycl_usm_array_interface__ + mem_ptr = <char *>(<size_t> ary_iface["data"][0]) + ary_ptr = <char *> self.data_ + ro_flag = False if (self.flags_ & USM_ARRAY_WRITABLE) else True + ary_iface["data"] = (<size_t> mem_ptr, ro_flag) + ary_iface["shape"] = self.shape + if (self.strides_): + ary_iface["strides"] = _make_int_tuple(self.nd_, self.strides_) + else: + if (self.flags_ & USM_ARRAY_C_CONTIGUOUS): + ary_iface["strides"] = None + elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS): + ary_iface["strides"] = _f_contig_strides(self.nd_, self.shape_) + else: + raise InternalUSMArrayError( + "USM Array is not contiguous and has empty strides" + ) + ary_iface["typestr"] = _make_typestr(self.typenum_) + byte_offset = ary_ptr - mem_ptr + item_size = self.get_itemsize() + if (byte_offset % item_size): + raise InternalUSMArrayError( + "byte_offset is not a multiple of item_size.") + elem_offset = byte_offset // item_size + ary_iface["offset"] = elem_offset + # must wait for content of the memory to finalize + self.sycl_queue.wait() + return ary_iface + + @property + def ndim(self): + """ + Gives the number of indices needed to address elements of this array. + """ + return self.nd_ + + @property + def usm_data(self): + """ + Gives USM memory object underlying :class:`.usm_ndarray` instance. + """ + return self.get_base() + + @property + def shape(self): + """ + Elements of the shape tuple give the lengths of the + respective array dimensions. + + Setting shape is allowed only when reshaping to the requested + dimensions can be returned as a view, otherwise :exc:`AttributeError` + is raised. Use :func:`dpctl.tensor.reshape` to reshape the array + in all cases. + + :Example: + + ..
code-block:: python + + from dpnp import tensor + + x = tensor.arange(899) + x.shape = (29, 31) + """ + if self.nd_ > 0: + return _make_int_tuple(self.nd_, self.shape_) + else: + return tuple() + + @shape.setter + def shape(self, new_shape): + """ + Modifies usm_ndarray instance in-place by changing its metadata + about the shape and the strides of the array, or raises + `AttributeError` exception if in-place change is not possible. + + Args: + new_shape (tuple, int): + New shape. Only non-negative values are supported. + The new shape must not change the number of elements + in the array. + + Whether the array can be reshaped in-place depends on its + strides. Use :func:`dpctl.tensor.reshape`, which always + succeeds by performing a copy if necessary. + """ + cdef int new_nd = -1 + cdef Py_ssize_t nelems = -1 + cdef int err = 0 + cdef Py_ssize_t min_disp = 0 + cdef Py_ssize_t max_disp = 0 + cdef int contig_flag = 0 + cdef Py_ssize_t *shape_ptr = NULL + cdef Py_ssize_t *strides_ptr = NULL + cdef Py_ssize_t size = -1 + import operator + + from ._reshape import reshaped_strides + + try: + new_nd = len(new_shape) + except TypeError: + new_nd = 1 + new_shape = (new_shape,) + try: + new_shape = tuple(operator.index(dim) for dim in new_shape) + except TypeError: + raise TypeError( + "Target shape must be a finite iterable of integers" + ) + size = shape_to_elem_count(self.nd_, self.shape_) + if not np.prod(new_shape) == size: + raise TypeError( + f"Can not reshape array of size {self.size} into {new_shape}" + ) + if size > 0: + new_strides = reshaped_strides( + self.shape, + self.strides, + new_shape + ) + else: + new_strides = (1,) * len(new_shape) + if new_strides is None: + raise AttributeError( + "Incompatible shape for in-place modification. " + "Use `reshape()` to make a copy with the desired shape." + ) + err = _from_input_shape_strides( + new_nd, new_shape, new_strides, + self.get_itemsize(), + b"C", + &shape_ptr, &strides_ptr, + &nelems, &min_disp, &max_disp, &contig_flag + ) + if (err == 0): + if (self.shape_): + PyMem_Free(self.shape_) + if (self.strides_): + PyMem_Free(self.strides_) + self.flags_ = (contig_flag | (self.flags_ & USM_ARRAY_WRITABLE)) + self.nd_ = new_nd + self.shape_ = shape_ptr + self.strides_ = strides_ptr + else: + raise InternalUSMArrayError( + "Encountered in shape setter, error code {err}".format(err=err) + ) + + @property + def strides(self): + """ + Returns memory displacement in array elements, upon unit + change of respective index. + + For example, for strides ``(s1, s2, s3)`` and multi-index + ``(i1, i2, i3)`` position of the respective element relative + to zero multi-index element is ``s1*i1 + s2*i2 + s3*i3``. + + :Example: + + .. code-block:: python + + from dpnp import tensor + + x = tensor.zeros((20, 30)) + xv = x[10:, :15] + + multi_id = (3, 5) + byte_displacement = xv[multi_id]._pointer - xv[0, 0]._pointer + element_displacement = sum( + i * s for i, s in zip(multi_id, xv.strides) + ) + assert byte_displacement == element_displacement * xv.itemsize + """ + if (self.strides_): + return _make_int_tuple(self.nd_, self.strides_) + else: + if (self.flags_ & USM_ARRAY_C_CONTIGUOUS): + return _c_contig_strides(self.nd_, self.shape_) + elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS): + return _f_contig_strides(self.nd_, self.shape_) + else: + raise ValueError("Inconsistent usm_ndarray data") + + @property + def flags(self): + """ + Returns :class:`dpctl.tensor._flags.Flags` object.
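+ + :Example: + + .. code-block:: python + + from dpnp import tensor + + # assuming dpctl-compatible Flags attributes + x = tensor.ones((3, 4)) + x.flags.c_contiguous # True + x[:, ::2].flags.c_contiguous # False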
+ """ + return _flags.Flags(self, self.flags_) + + cdef _set_writable_flag(self, int flag): + cdef int mask = (USM_ARRAY_WRITABLE if flag else 0) + self.flags_ = _copy_writable(self.flags_, mask) + + @property + def usm_type(self): + """ + USM type of underlying memory. Possible values are: + + * ``"device"`` + USM-device allocation in device memory, only accessible + to kernels executed on the device + * ``"shared"`` + USM-shared allocation in device memory, accessible both + from the device and from host + * ``"host"`` + USM-host allocation in host memory, accessible both + from the device and from host + + See: https://docs.oneapi.com/versions/latest/dpcpp/iface/usm.html + """ + return self.base_.get_usm_type() + + @property + def itemsize(self): + """ + Size of array element in bytes. + """ + return self.get_itemsize() + + @property + def nbytes(self): + """ + Total bytes consumed by the elements of the array. + """ + return ( + shape_to_elem_count(self.nd_, self.shape_) * + self.get_itemsize()) + + @property + def size(self): + """ + Number of elements in the array. + """ + return shape_to_elem_count(self.nd_, self.shape_) + + @property + def dtype(self): + """ + Returns NumPy's dtype corresponding to the type of the array elements. + """ + return np.dtype(_make_typestr(self.typenum_)) + + @property + def sycl_queue(self): + """ + Returns :class:`dpctl.SyclQueue` object associated with USM data. + """ + return self.get_sycl_queue() + + @property + def sycl_device(self): + """ + Returns :class:`dpctl.SyclDevice` object on which USM data + was allocated. + """ + q = self.sycl_queue + return q.sycl_device + + @property + def device(self): + """ + Returns :class:`dpctl.tensor.Device` object representing + residence of the array data. + + The ``Device`` object represents Array API notion of the + device, and contains :class:`dpctl.SyclQueue` associated + with this array. Hence, ``.device`` property provides + information distinct from ``.sycl_device`` property. + + :Example: + + .. code-block:: python + + >>> from dpnp import tensor + >>> x = tensor.ones(10) + >>> x.device + Device(level_zero:gpu:0) + """ + return Device.create_device(self.sycl_queue) + + @property + def sycl_context(self): + """ + Returns :class:`dpctl.SyclContext` object to which USM data is bound. + """ + q = self.sycl_queue + return q.sycl_context + + @property + def T(self): + """Returns transposed array for 2D array, raises ``ValueError`` + otherwise. + """ + if self.nd_ == 2: + return _transpose(self) + else: + raise ValueError( + "array.T requires array to have 2 dimensions. " + "Use array.mT to transpose stacks of matrices and " + "dpnp.tensor.permute_dims() to permute dimensions." + ) + + @property + def mT(self): + """ Returns array (a view) where the last two dimensions are + transposed. + """ + if self.nd_ < 2: + raise ValueError( + "array.mT requires array to have at least 2 dimensions." + ) + return _m_transpose(self) + + @property + def real(self): + """ + Returns view into real component for arrays with + complex data-types and returns itself for all other + data-types. + + :Example: + + .. 
code-block:: python
+
+            from dpnp import tensor
+
+            # Create complex array from
+            # arrays of real and imaginary parts
+
+            re = tensor.linspace(-1, 1, num=100, dtype="f4")
+            im = tensor.full_like(re, fill_value=tensor.pi)
+
+            z = tensor.empty_like(re, dtype="c8")
+            z.real[:] = re
+            z.imag[:] = im
+        """
+        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
+        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
+            # elements are real
+            return self
+        if (self.typenum_ < UAR_TYPE_SENTINEL):
+            return _real_view(self)
+
+    @property
+    def imag(self):
+        """ Returns view into imaginary component for arrays with
+        complex data-types and returns new zero array for all other
+        data-types.
+
+        :Example:
+
+        .. code-block:: python
+
+            from dpnp import tensor
+
+            # Reset imaginary part of complex array
+
+            z = tensor.ones(100, dtype="c8")
+            z.imag[:] = tensor.pi/2
+        """
+        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
+        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
+            # elements are real
+            return _zero_like(self)
+        if (self.typenum_ < UAR_TYPE_SENTINEL):
+            return _imag_view(self)
+
+    def __getitem__(self, ind):
+        cdef tuple _meta = _basic_slice_meta(
+            ind, (<object>self).shape, (<object>self).strides,
+            self.get_offset())
+        cdef usm_ndarray res
+        cdef int i = 0
+        cdef bint matching = 1
+
+        if len(_meta) < 5:
+            raise RuntimeError
+
+        res = usm_ndarray.__new__(
+            usm_ndarray,
+            _meta[0],
+            dtype=_make_typestr(self.typenum_),
+            strides=_meta[1],
+            buffer=self.base_,
+            offset=_meta[2]
+        )
+        res.array_namespace_ = self.array_namespace_
+
+        adv_ind = _meta[3]
+        adv_ind_start_p = _meta[4]
+
+        if adv_ind_start_p < 0:
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index
+
+        # if len(adv_ind) == 1, the (only) element is always an array
+        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+            key_ = adv_ind[0]
+            adv_ind_end_p = key_.ndim + adv_ind_start_p
+            if adv_ind_end_p > res.ndim:
+                raise IndexError("too many indices for the array")
+            key_shape = key_.shape
+            arr_shape = res.shape[adv_ind_start_p:adv_ind_end_p]
+            for i in range(key_.ndim):
+                if matching:
+                    if not key_shape[i] == arr_shape[i] and key_shape[i] > 0:
+                        matching = 0
+            if not matching:
+                raise IndexError(
+                    "boolean index did not match indexed array in dimensions"
+                )
+            res = _extract_impl(res, key_, axis=adv_ind_start_p)
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        if any(
+            (
+                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+            ) for ind in adv_ind
+        ):
+            adv_ind_int = list()
+            for ind in adv_ind:
+                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+                    adv_ind_int.extend(_nonzero_impl(ind))
+                else:
+                    adv_ind_int.append(ind)
+            res = _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p)
+            res.flags_ = _copy_writable(res.flags_, self.flags_)
+            return res
+
+        res = _take_multi_index(res, adv_ind, adv_ind_start_p)
+        res.flags_ = _copy_writable(res.flags_, self.flags_)
+        return res
+
+    def to_device(self, target_device, /, *, stream=None):
+        """ to_device(target_device, /, *, stream=None)
+
+        Transfers this array to the specified target device.
+
+        :Example:
+            .. code-block:: python
+
+                import dpctl
+                import dpnp.tensor as dpt
+
+                x = dpt.full(10**6, 2, dtype="int64")
+                q_prof = dpctl.SyclQueue(
+                    x.sycl_device, property="enable_profiling")
+                # return a view with profile-enabled queue
+                y = x.to_device(q_prof)
+                timer = dpctl.SyclTimer()
+                with timer(q_prof):
+                    z = y * y
+                print(timer.dt)
+
+        Args:
+            target_device (object):
+                Array API concept of target device.
+                It can be a oneAPI filter selector string,
+                an instance of :class:`dpctl.SyclDevice` corresponding to a
+                non-partitioned SYCL device, an instance of
+                :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device`
+                object returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+            stream (:class:`dpctl.SyclQueue`, optional):
+                Execution queue to synchronize with. If ``None``,
+                synchronization is not performed.
+
+        Returns:
+            usm_ndarray:
+                A view if data copy is not required, and a copy otherwise.
+                If copying is required, it is done by copying from the
+                original allocation device to the host, followed by copying
+                from host to the target device.
+        """
+        cdef c_dpctl.DPCTLSyclQueueRef QRef = NULL
+        cdef c_dpmem._Memory arr_buf
+        d = Device.create_device(target_device)
+
+        _validate_and_use_stream(stream, self.sycl_queue)
+
+        if (d.sycl_context == self.sycl_context):
+            arr_buf = self.usm_data
+            QRef = (<c_dpctl.SyclQueue> d.sycl_queue).get_queue_ref()
+            view_buffer = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+                arr_buf.get_data_ptr(),
+                arr_buf.nbytes,
+                QRef,
+                memory_owner=arr_buf
+            )
+            res = usm_ndarray(
+                self.shape,
+                self.dtype,
+                buffer=view_buffer,
+                strides=self.strides,
+                offset=self.get_offset()
+            )
+            res.flags_ = self.flags_
+            return res
+        else:
+            nbytes = self.usm_data.nbytes
+            copy_buffer = type(self.usm_data)(
+                nbytes, queue=d.sycl_queue
+            )
+            copy_buffer.copy_from_device(self.usm_data)
+            res = usm_ndarray(
+                self.shape,
+                self.dtype,
+                buffer=copy_buffer,
+                strides=self.strides,
+                offset=self.get_offset()
+            )
+            res.flags_ = self.flags_
+            return res
+
+    def _set_namespace(self, mod):
+        """ Sets array namespace to given module `mod`. """
+        self.array_namespace_ = mod
+
+    def __array_namespace__(self, api_version=None):
+        """
+        Returns array namespace, member functions of which
+        implement data API.
+
+        Args:
+            api_version (str, optional):
+                Request namespace compliant with given version of
+                array API. If ``None``, namespace for the most
+                recent supported version is returned.
+                Default: ``None``.
+        """
+        if api_version is not None:
+            from ._array_api import __array_api_version__
+            if not isinstance(api_version, str):
+                raise TypeError(f"Expected type str, got {type(api_version)}")
+            if api_version != __array_api_version__:
+                raise ValueError(f"Only {__array_api_version__} is supported")
+        return (
+            self.array_namespace_
+            if self.array_namespace_ is not None
+            else dpnp.tensor
+        )
+
+    def __bool__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__bool__()
+
+        if self.size == 0:
+            raise ValueError(
+                "The truth value of an empty array is ambiguous"
+            )
+
+        raise ValueError(
+            "The truth value of an array with more than one element is "
+            "ambiguous. Use dpnp.tensor.any() or dpnp.tensor.all()"
+        )
+
+    def __float__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__float__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __complex__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__complex__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __int__(self):
+        if self.size == 1:
+            _check_0d_scalar_conversion(self)
+            view = _as_zero_dim_ndarray(self)
+            return view.__int__()
+
+        raise ValueError(
+            "only size-1 arrays can be converted to Python scalars"
+        )
+
+    def __index__(self):
+        if np.issubdtype(self.dtype, np.integer):
+            return int(self)
+
+        raise IndexError("only integer arrays are valid indices")
+
+    def __abs__(self):
+        return dpnp.tensor.abs(self)
+
+    def __add__(self, other):
+        """
+        Implementation for operator.add
+        """
+        return dpnp.tensor.add(self, other)
+
+    def __and__(self, other):
+        "Implementation for operator.and_"
+        return dpnp.tensor.bitwise_and(self, other)
+
+    def __dlpack__(
+        self, *, stream=None, max_version=None, dl_device=None, copy=None
+    ):
+        """
+        Produces DLPack capsule.
+
+        Args:
+            stream (:class:`dpctl.SyclQueue`, optional):
+                Execution queue to synchronize with.
+                If ``None``, synchronization is not performed.
+                Default: ``None``.
+            max_version (tuple[int, int], optional):
+                The maximum DLPack version the consumer (caller of
+                ``__dlpack__``) supports. As ``__dlpack__`` may not
+                always return a DLPack capsule with version
+                ``max_version``, the consumer must verify the version
+                even if this argument is passed.
+                Default: ``None``.
+            dl_device (tuple[enum.Enum, int], optional):
+                The device the returned DLPack capsule will be
+                placed on.
+                The device must be a 2-tuple matching the format of
+                ``__dlpack_device__`` method, an integer enumerator
+                representing the device type followed by an integer
+                representing the index of the device.
+                Default: ``None``.
+            copy (bool, optional):
+                Boolean indicating whether or not to copy the input.
+
+                * If ``copy`` is ``True``, the input will always be
+                  copied.
+                * If ``False``, a ``BufferError`` will be raised if a
+                  copy is deemed necessary.
+                * If ``None``, a copy will be made only if deemed
+                  necessary, otherwise, the existing memory buffer will
+                  be reused.
+
+                Default: ``None``.
+
+        Raises:
+            MemoryError:
+                when host memory can not be allocated.
+            DLPackCreationError:
+                when array is allocated on a partitioned
+                SYCL device, or with a non-default context.
+            BufferError:
+                when a copy is deemed necessary but ``copy``
+                is ``False`` or when the provided ``dl_device``
+                cannot be handled.
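+
+        :Example:
+
+        .. code-block:: python
+
+            from dpnp import tensor
+
+            x = tensor.arange(100)
+            # a minimal sketch: request a capsule in the versioned
+            # DLPack format; consumers normally obtain capsules via
+            # `from_dlpack` rather than by calling this directly
+            capsule = x.__dlpack__(max_version=(1, 0))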
+ """ + if max_version is None: + # legacy path for DLManagedTensor + # copy kwarg ignored because copy flag can't be set + _caps = c_dlpack.to_dlpack_capsule(self) + _validate_and_use_stream(stream, self.sycl_queue) + return _caps + else: + if not isinstance(max_version, tuple) or len(max_version) != 2: + raise TypeError( + "`__dlpack__` expects `max_version` to be a " + "2-tuple of integers `(major, minor)`, instead " + f"got {max_version}" + ) + dpctl_dlpack_version = get_build_dlpack_version() + if max_version[0] >= dpctl_dlpack_version[0]: + # DLManagedTensorVersioned path + if dl_device is not None: + if not isinstance(dl_device, tuple) or len(dl_device) != 2: + raise TypeError( + "`__dlpack__` expects `dl_device` to be a 2-tuple " + "of `(device_type, device_id)`, instead " + f"got {dl_device}" + ) + if dl_device != self.__dlpack_device__(): + if copy is False: + raise BufferError( + "array cannot be placed on the requested " + "device without a copy" + ) + if _is_host_cpu(dl_device): + if stream is not None: + raise ValueError( + "`stream` must be `None` when `dl_device` " + "is of type `kDLCPU`" + ) + from ._copy_utils import _copy_to_numpy + _arr = _copy_to_numpy(self) + _arr.flags["W"] = self.flags["W"] + return c_dlpack.numpy_to_dlpack_versioned_capsule( + _arr, True + ) + else: + raise BufferError( + f"targeting `dl_device` {dl_device} with " + "`__dlpack__` is not yet implemented" + ) + if copy is None: + copy = False + # TODO: strategy for handling stream on different device + # from dl_device + if copy: + _validate_and_use_stream(stream, self.sycl_queue) + nbytes = self.usm_data.nbytes + copy_buffer = type(self.usm_data)( + nbytes, queue=self.sycl_queue + ) + copy_buffer.copy_from_device(self.usm_data) + _copied_arr = usm_ndarray( + self.shape, + self.dtype, + buffer=copy_buffer, + strides=self.strides, + offset=self.get_offset() + ) + _copied_arr.flags_ = self.flags_ + _caps = c_dlpack.to_dlpack_versioned_capsule( + _copied_arr, copy + ) + else: + _caps = c_dlpack.to_dlpack_versioned_capsule(self, copy) + _validate_and_use_stream(stream, self.sycl_queue) + return _caps + else: + # legacy path for DLManagedTensor + _caps = c_dlpack.to_dlpack_capsule(self) + _validate_and_use_stream(stream, self.sycl_queue) + return _caps + + def __dlpack_device__(self): + """ + Gives a tuple (``device_type``, ``device_id``) corresponding to + ``DLDevice`` entry in ``DLTensor`` in DLPack protocol. + + The tuple describes the non-partitioned device where the array has been + allocated, or the non-partitioned parent device of the allocation + device. + + See :class:`dpctl.tensor.DLDeviceType` for a list of devices supported + by the DLPack protocol. + + Raises: + DLPackCreationError: + when the ``device_id`` could not be determined. + """ + try: + dev_id = self.sycl_device.get_device_id() + except ValueError as e: + raise c_dlpack.DLPackCreationError( + "Could not determine id of the device where array was " + "allocated." 
+            )
+        return (
+            DLDeviceType.kDLOneAPI,
+            dev_id,
+        )
+
+    def __eq__(self, other):
+        return dpnp.tensor.equal(self, other)
+
+    def __floordiv__(self, other):
+        return dpnp.tensor.floor_divide(self, other)
+
+    def __ge__(self, other):
+        return dpnp.tensor.greater_equal(self, other)
+
+    def __gt__(self, other):
+        return dpnp.tensor.greater(self, other)
+
+    def __invert__(self):
+        return dpnp.tensor.bitwise_invert(self)
+
+    def __le__(self, other):
+        return dpnp.tensor.less_equal(self, other)
+
+    def __len__(self):
+        if (self.nd_):
+            return self.shape[0]
+        else:
+            raise TypeError("len() of unsized object")
+
+    def __lshift__(self, other):
+        return dpnp.tensor.bitwise_left_shift(self, other)
+
+    def __lt__(self, other):
+        return dpnp.tensor.less(self, other)
+
+    def __matmul__(self, other):
+        return dpnp.tensor.matmul(self, other)
+
+    def __mod__(self, other):
+        return dpnp.tensor.remainder(self, other)
+
+    def __mul__(self, other):
+        return dpnp.tensor.multiply(self, other)
+
+    def __ne__(self, other):
+        return dpnp.tensor.not_equal(self, other)
+
+    def __neg__(self):
+        return dpnp.tensor.negative(self)
+
+    def __or__(self, other):
+        return dpnp.tensor.bitwise_or(self, other)
+
+    def __pos__(self):
+        return dpnp.tensor.positive(self)
+
+    def __pow__(self, other):
+        return dpnp.tensor.pow(self, other)
+
+    def __rshift__(self, other):
+        return dpnp.tensor.bitwise_right_shift(self, other)
+
+    def __setitem__(self, key, rhs):
+        cdef tuple _meta
+        cdef usm_ndarray Xv
+
+        if (self.flags_ & USM_ARRAY_WRITABLE) == 0:
+            raise ValueError("Can not modify read-only array.")
+
+        _meta = _basic_slice_meta(
+            key, (<object>self).shape, (<object>self).strides,
+            self.get_offset()
+        )
+
+        if len(_meta) < 5:
+            raise RuntimeError
+
+        Xv = usm_ndarray.__new__(
+            usm_ndarray,
+            _meta[0],
+            dtype=_make_typestr(self.typenum_),
+            strides=_meta[1],
+            buffer=self.base_,
+            offset=_meta[2],
+        )
+        # set namespace
+        Xv.array_namespace_ = self.array_namespace_
+
+        from ._copy_utils import (
+            _copy_from_numpy_into,
+            _copy_from_usm_ndarray_to_usm_ndarray,
+            _nonzero_impl,
+            _place_impl,
+            _put_multi_index,
+        )
+
+        adv_ind = _meta[3]
+        adv_ind_start_p = _meta[4]
+
+        if adv_ind_start_p < 0:
+            # basic slicing
+            if isinstance(rhs, usm_ndarray):
+                _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs)
+            else:
+                if hasattr(rhs, "__sycl_usm_array_interface__"):
+                    from dpnp.tensor import asarray
+                    try:
+                        rhs_ar = asarray(rhs)
+                        _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar)
+                    except Exception:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} could not be "
+                            "converted to usm_ndarray"
+                        )
+                else:
+                    rhs_np = np.asarray(rhs)
+                    if type_bytesize(rhs_np.dtype.num) < 0:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} can not be "
+                            "assigned to usm_ndarray because of "
+                            f"unsupported data type '{rhs_np.dtype}'"
+                        )
+                    try:
+                        _copy_from_numpy_into(Xv, rhs_np)
+                    except Exception:
+                        raise ValueError(
+                            f"Input of type {type(rhs)} could not be "
+                            "copied into dpnp.tensor.usm_ndarray"
+                        )
+            return
+
+        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+            _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p)
+            return
+
+        if any(
+            (
+                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+            ) for ind in adv_ind
+        ):
+            adv_ind_int = list()
+            for ind in adv_ind:
+                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+                    adv_ind_int.extend(_nonzero_impl(ind))
+                else:
+                    adv_ind_int.append(ind)
+            _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs)
+            return
+
+        _put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs)
+        return
+
+    def __sub__(self, 
other): + return dpnp.tensor.subtract(self, other) + + def __truediv__(self, other): + return dpnp.tensor.divide(self, other) + + def __xor__(self, other): + return dpnp.tensor.bitwise_xor(self, other) + + def __radd__(self, other): + return dpnp.tensor.add(other, self) + + def __rand__(self, other): + return dpnp.tensor.bitwise_and(other, self) + + def __rfloordiv__(self, other): + return dpnp.tensor.floor_divide(other, self) + + def __rlshift__(self, other): + return dpnp.tensor.bitwise_left_shift(other, self) + + def __rmatmul__(self, other): + return dpnp.tensor.matmul(other, self) + + def __rmod__(self, other): + return dpnp.tensor.remainder(other, self) + + def __rmul__(self, other): + return dpnp.tensor.multiply(other, self) + + def __ror__(self, other): + return dpnp.tensor.bitwise_or(other, self) + + def __rpow__(self, other): + return dpnp.tensor.pow(other, self) + + def __rrshift__(self, other): + return dpnp.tensor.bitwise_right_shift(other, self) + + def __rsub__(self, other): + return dpnp.tensor.subtract(other, self) + + def __rtruediv__(self, other): + return dpnp.tensor.divide(other, self) + + def __rxor__(self, other): + return dpnp.tensor.bitwise_xor(other, self) + + def __iadd__(self, other): + return dpnp.tensor.add._inplace_op(self, other) + + def __iand__(self, other): + return dpnp.tensor.bitwise_and._inplace_op(self, other) + + def __ifloordiv__(self, other): + return dpnp.tensor.floor_divide._inplace_op(self, other) + + def __ilshift__(self, other): + return dpnp.tensor.bitwise_left_shift._inplace_op(self, other) + + def __imatmul__(self, other): + return dpnp.tensor.matmul(self, other, out=self, dtype=self.dtype) + + def __imod__(self, other): + return dpnp.tensor.remainder._inplace_op(self, other) + + def __imul__(self, other): + return dpnp.tensor.multiply._inplace_op(self, other) + + def __ior__(self, other): + return dpnp.tensor.bitwise_or._inplace_op(self, other) + + def __ipow__(self, other): + return dpnp.tensor.pow._inplace_op(self, other) + + def __irshift__(self, other): + return dpnp.tensor.bitwise_right_shift._inplace_op(self, other) + + def __isub__(self, other): + return dpnp.tensor.subtract._inplace_op(self, other) + + def __itruediv__(self, other): + return dpnp.tensor.divide._inplace_op(self, other) + + def __ixor__(self, other): + return dpnp.tensor.bitwise_xor._inplace_op(self, other) + + def __str__(self): + return usm_ndarray_str(self) + + def __repr__(self): + return usm_ndarray_repr(self) + + def __array__(self, dtype=None, /, *, copy=None): + """NumPy's array protocol method to disallow implicit conversion. + + Without this definition, `numpy.asarray(usm_ar)` converts + usm_ndarray instance into NumPy array with data type `object` + and every element being 0d usm_ndarray. + + https://github.com/IntelPython/dpctl/pull/1384#issuecomment-1707212972 + """ + raise TypeError( + "Implicit conversion to a NumPy array is not allowed. 
" + "Use `dpnp.tensor.asnumpy` to copy data from this " + "`dpnp.tensor.usm_ndarray` instance to NumPy array" + ) + + +cdef usm_ndarray _real_view(usm_ndarray ary): + """ + View into real parts of a complex type array + """ + cdef int r_typenum_ = -1 + cdef usm_ndarray r = None + cdef Py_ssize_t offset_elems = 0 + + if (ary.typenum_ == UAR_CFLOAT): + r_typenum_ = UAR_FLOAT + elif (ary.typenum_ == UAR_CDOUBLE): + r_typenum_ = UAR_DOUBLE + else: + raise InternalUSMArrayError( + "_real_view call on array of non-complex type.") + + offset_elems = ary.get_offset() * 2 + r = usm_ndarray.__new__( + usm_ndarray, + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), + dtype=_make_typestr(r_typenum_), + strides=tuple(2 * si for si in ary.strides), + buffer=ary.base_, + offset=offset_elems, + order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F") + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + r.array_namespace_ = ary.array_namespace_ + return r + + +cdef usm_ndarray _imag_view(usm_ndarray ary): + """ + View into imaginary parts of a complex type array + """ + cdef int r_typenum_ = -1 + cdef usm_ndarray r = None + cdef Py_ssize_t offset_elems = 0 + + if (ary.typenum_ == UAR_CFLOAT): + r_typenum_ = UAR_FLOAT + elif (ary.typenum_ == UAR_CDOUBLE): + r_typenum_ = UAR_DOUBLE + else: + raise InternalUSMArrayError( + "_imag_view call on array of non-complex type.") + + # displace pointer to imaginary part + offset_elems = 2 * ary.get_offset() + 1 + r = usm_ndarray.__new__( + usm_ndarray, + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), + dtype=_make_typestr(r_typenum_), + strides=tuple(2 * si for si in ary.strides), + buffer=ary.base_, + offset=offset_elems, + order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F") + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + r.array_namespace_ = ary.array_namespace_ + return r + + +cdef usm_ndarray _transpose(usm_ndarray ary): + """ + Construct transposed array without copying the data + """ + cdef usm_ndarray r = usm_ndarray.__new__( + usm_ndarray, + _make_reversed_int_tuple(ary.nd_, ary.shape_), + dtype=_make_typestr(ary.typenum_), + strides=( + _make_reversed_int_tuple(ary.nd_, ary.strides_) + if (ary.strides_) else None), + buffer=ary.base_, + order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"), + offset=ary.get_offset() + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + return r + + +cdef usm_ndarray _m_transpose(usm_ndarray ary): + """ + Construct matrix transposed array + """ + cdef usm_ndarray r = usm_ndarray.__new__( + usm_ndarray, + _swap_last_two(_make_int_tuple(ary.nd_, ary.shape_)), + dtype=_make_typestr(ary.typenum_), + strides=_swap_last_two(ary.strides), + buffer=ary.base_, + order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"), + offset=ary.get_offset() + ) + r.flags_ = _copy_writable(r.flags_, ary.flags_) + return r + + +cdef usm_ndarray _zero_like(usm_ndarray ary): + """ + Make C-contiguous array of zero elements with same shape, + type, device, and sycl_queue as ary. 
+ """ + cdef dt = _make_typestr(ary.typenum_) + cdef usm_ndarray r = usm_ndarray( + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), + dtype=dt, + buffer=ary.base_.get_usm_type(), + buffer_ctor_kwargs={"queue": ary.get_sycl_queue()}, + ) + r.base_.memset() + return r + + +def _is_object_with_buffer_protocol(o): + "Returns True if object supports Python buffer protocol" + return _is_buffer(o) diff --git a/dpnp/tensor/_utility_functions.py b/dpnp/tensor/_utility_functions.py new file mode 100644 index 000000000000..651ce0830266 --- /dev/null +++ b/dpnp/tensor/_utility_functions.py @@ -0,0 +1,506 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import builtins +import operator + +import dpctl.utils as du + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti +import dpnp.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_index, normalize_axis_tuple +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) + + +def _boolean_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpnp.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt.astype(x, dpt.bool) + x_tmp = x + res_shape = () + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt.astype(x, dpt.bool) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + # always allocate the temporary as + # int32 and usm-device to ensure that atomic updates + # are supported + res_tmp = dpt.empty( + res_shape, + dtype=dpt.int32, + usm_type="device", + sycl_queue=exec_q, + ) + hev0, ev0 = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res_tmp, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev0, ev0) + + # copy to boolean result array + res = dpt.empty( + res_shape, + dtype=dpt.bool, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0] + ) + _manager.add_event_pair(hev1, ev1) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + return res + + +def all(x, /, *, axis=None, keepdims=False): + """ + all(x, axis=None, keepdims=False) + + Tests whether all input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical AND reduction. + When `axis` is `None`, a logical AND reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. + + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical AND reduction. + """ + return _boolean_reduction(x, axis, keepdims, tri._all) + + +def any(x, /, *, axis=None, keepdims=False): + """ + any(x, axis=None, keepdims=False) + + Tests whether any input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical OR reduction. 
+ When `axis` is `None`, a logical OR reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. + + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical OR reduction. + """ + return _boolean_reduction(x, axis, keepdims, tri._any) + + +def _validate_diff_shape(sh1, sh2, axis): + """ + Utility for validating that two shapes `sh1` and `sh2` + are possible to concatenate along `axis`. + """ + if not sh2: + # scalars will always be accepted + return True + else: + sh1_ndim = len(sh1) + if sh1_ndim == len(sh2) and builtins.all( + sh1[i] == sh2[i] for i in range(sh1_ndim) if i != axis + ): + return True + else: + return False + + +def _concat_diff_input(arr, axis, prepend, append): + """ + Concatenates `arr`, `prepend` and, `append` along `axis`, + where `arr` is an array and `prepend` and `append` are + any mixture of arrays and scalars. + """ + if prepend is not None and append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + q3, append_usm_type = _get_queue_usm_type(append) + if q2 is None and q3 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + elif q3 is None: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + elif q2 is None: + exec_q = dpt.get_execution_queue((q1, q3)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + else: + exec_q = dpt.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + append_usm_type, + ) + ) + dpt.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + append_shape = _get_shape(append) + if not builtins.all( + isinstance(s, (tuple, list)) + for s in ( + prepend_shape, + append_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + append_dtype = _get_dtype(append, sycl_dev) + if not builtins.all( + _validate_dtype(o) for o in (prepend_dtype, append_dtype) + ): + raise ValueError("Operands have unsupported data types") + prepend_dtype, append_dtype = _resolve_one_strong_two_weak_types( + arr_dtype, prepend_dtype, append_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt.broadcast_to(a_prepend, prepend_shape) + if not append_shape: + append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt.broadcast_to(a_append, append_shape) + return dpt.concat((a_prepend, arr, a_append), axis=axis) + elif prepend is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + dpt.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + if not isinstance(prepend_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. 
" + "Argument is expected to be a " + "list or tuple" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + if not _validate_dtype(prepend_dtype): + raise ValueError("Operand has unsupported data type") + prepend_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, prepend_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt.broadcast_to(a_prepend, prepend_shape) + return dpt.concat((a_prepend, arr), axis=axis) + elif append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, append_usm_type = _get_queue_usm_type(append) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = dpt.get_execution_queue((q1, q2)) + if exec_q is None: + raise dpt.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = dpt.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + dpt.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + append_shape = _get_shape(append) + if not isinstance(append_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. " + "Argument is expected to be a " + "list or tuple" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + append_dtype = _get_dtype(append, sycl_dev) + if not _validate_dtype(append_dtype): + raise ValueError("Operand has unsupported data type") + append_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, append_dtype, sycl_dev + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not append_shape: + append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt.broadcast_to(a_append, append_shape) + return dpt.concat((arr, a_append), axis=axis) + else: + arr1 = arr + return arr1 + + +def diff(x, /, *, axis=-1, n=1, prepend=None, append=None): + """ + Calculates the `n`-th discrete forward difference of `x` along `axis`. + + Args: + x (usm_ndarray): + input array. + axis (int): + axis along which to compute the difference. A valid axis must be on + the interval `[-N, N)`, where `N` is the rank (number of + dimensions) of `x`. + Default: `-1` + n (int): + number of times to recursively compute the difference. + Default: `1`. + prepend (Union[usm_ndarray, bool, int, float, complex]): + value or values to prepend to the specified axis before taking the + difference. + Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + append (Union[usm_ndarray, bool, int, float, complex]): + value or values to append to the specified axis before taking the + difference. 
+ Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + + Returns: + usm_ndarray: + an array containing the `n`-th differences. The array will have the + same shape as `x`, except along `axis`, which will have shape: + ``prepend.shape[axis] + x.shape[axis] + append.shape[axis] - n`` + + The data type of the returned array is determined by the Type + Promotion Rules. + """ + + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expecting dpnp.tensor.usm_ndarray type, " f"got {type(x)}" + ) + x_nd = x.ndim + axis = normalize_axis_index(operator.index(axis), x_nd) + n = operator.index(n) + if n < 0: + raise ValueError(f"`n` must be positive, got {n}") + arr = _concat_diff_input(x, axis, prepend, append) + if n == 0: + return arr + # form slices and recurse + sl0 = tuple( + slice(None) if i != axis else slice(1, None) for i in range(x_nd) + ) + sl1 = tuple( + slice(None) if i != axis else slice(None, -1) for i in range(x_nd) + ) + + diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract + if n > 1: + arr_tmp0 = diff_op(arr[sl0], arr[sl1]) + arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1]) + n = n - 2 + if n > 0: + sl3 = tuple( + slice(None) if i != axis else slice(None, -2) + for i in range(x_nd) + ) + for _ in range(n): + arr_tmp0_sliced = arr_tmp0[sl3] + diff_op(arr_tmp1[sl0], arr_tmp1[sl1], out=arr_tmp0_sliced) + arr_tmp0, arr_tmp1 = arr_tmp1, arr_tmp0_sliced + arr = arr_tmp1 + else: + arr = diff_op(arr[sl0], arr[sl1]) + return arr diff --git a/dpnp/tensor/include/dlpack/LICENSE.third-party b/dpnp/tensor/include/dlpack/LICENSE.third-party new file mode 100644 index 000000000000..20a9c8a7b4dc --- /dev/null +++ b/dpnp/tensor/include/dlpack/LICENSE.third-party @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dpnp/tensor/include/dlpack/README.md b/dpnp/tensor/include/dlpack/README.md new file mode 100644 index 000000000000..315ad1b9a566 --- /dev/null +++ b/dpnp/tensor/include/dlpack/README.md @@ -0,0 +1,7 @@ +# DLPack header + +The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remote at tag v1.3 commit [`84d107b`](https://github.com/dmlc/dlpack/commit/84d107bf416c6bab9ae68ad285876600d230490d). + +The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/v1.3/include/dlpack/dlpack.h + +License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE diff --git a/dpnp/tensor/include/dlpack/dlpack.h b/dpnp/tensor/include/dlpack/dlpack.h new file mode 100644 index 000000000000..5196acc87711 --- /dev/null +++ b/dpnp/tensor/include/dlpack/dlpack.h @@ -0,0 +1,683 @@ +/*! + * Copyright (c) 2017 - by Contributors + * \file dlpack.h + * \brief The common header of DLPack. 
+ */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! \brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 3 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + + /*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ + typedef struct + { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; + } DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus + typedef enum : int32_t + { +#else +typedef enum +{ +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, + /*! \brief Microsoft MAIA devices */ + kDLMAIA = 17, + /*! \brief AWS Trainium */ + kDLTrn = 18, + } DLDeviceType; + + /*! + * \brief A Device for Tensor and operator. + */ + typedef struct + { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set + * to 0. + */ + int32_t device_id; + } DLDevice; + + /*! + * \brief The type code options DLDataType. + */ + typedef enum + { + /*! 
\brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to + * be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, + /*! \brief FP8 data types */ + kDLFloat8_e3m4 = 7U, + kDLFloat8_e4m3 = 8U, + kDLFloat8_e4m3b11fnuz = 9U, + kDLFloat8_e4m3fn = 10U, + kDLFloat8_e4m3fnuz = 11U, + kDLFloat8_e5m2 = 12U, + kDLFloat8_e5m2fnuz = 13U, + kDLFloat8_e8m0fnu = 14U, + /*! \brief FP6 data types + * Setting bits != 6 is currently unspecified, and the producer must + * ensure it is set while the consumer must stop importing if the value + * is unexpected. + */ + kDLFloat6_e2m3fn = 15U, + kDLFloat6_e3m2fn = 16U, + /*! \brief FP4 data types + * Setting bits != 4 is currently unspecified, and the producer must + * ensure it is set while the consumer must stop importing if the value + * is unexpected. + */ + kDLFloat4_e2m1fn = 17U, + } DLDataTypeCode; + + /*! + * \brief The data type the tensor can hold. The data type is assumed to + * follow the native endian-ness. An explicit error message should be raised + * when attempting to export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library + * convention, the underlying storage size of bool is 8 bits) + * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) + * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) + * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) + * + * When a sub-byte type is packed, DLPack requires the data to be in little + * bit-endian, i.e., for a packed data set D ((D >> (i * bits)) && bit_mask) + * stores the i-th element. + */ + typedef struct + { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; + } DLDataType; + + /*! + * \brief Plain C Tensor object, does not manage memory. + */ + typedef struct + { + /*! + * \brief The data pointer points to the allocated data. This will be + * CUDA device pointer or cl_mem handle in OpenCL. It may be opaque on + * some device types. This pointer is always aligned to 256 bytes as in + * CUDA. The `byte_offset` field should be used to point to the + * beginning of the data. + * + * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, + * TensorFlow, TVM, perhaps others) do not adhere to this 256 byte + * alignment requirement on CPU/CUDA/ROCm, and always use + * `byte_offset=0`. This must be fixed (after which this note will be + * updated); at the moment it is recommended to not rely on the data + * pointer being correctly aligned. 
+ * + * For given DLTensor, the size of memory required to store the contents + * of data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + * + * Note that if the tensor is of size zero, then the data pointer should + * be set to `NULL`. + */ + void *data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! + * \brief The shape of the tensor + * + * When ndim == 0, shape can be set to NULL. + */ + int64_t *shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes), + * can not be NULL if ndim != 0, must points to + * an array of ndim elements that specifies the strides, + * so consumer can always rely on strides[dim] being valid for 0 <= dim + * < ndim. + * + * When ndim == 0, strides can be set to NULL. + * + * \note Before DLPack v1.2, strides can be NULL to indicate contiguous + * data. This is not allowed in DLPack v1.2 and later. The rationale is + * to simplify the consumer handling. + */ + int64_t *strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; + } DLTensor; + + /*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. + * It is not meant to transfer the tensor. When the borrowing framework + * doesn't need the tensor, it should call the deleter to notify the host + * that the resource is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ + typedef struct DLManagedTensor + { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor + * in which DLManagedTensor is used in the framework. It can also be + * NULL. + */ + void *manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can + * be NULL if there is no way for the caller to provide a reasonable + * destructor. The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor *self); + } DLManagedTensor; + +// bit masks used in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief bit mask to indicate that the tensor is a copy made by the producer. + * + * If set, the tensor is considered solely owned throughout its lifetime by the + * consumer, until the producer-provided deleter is invoked. + */ +#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) + +/*! + * \brief bit mask to indicate that whether a sub-byte type is packed or padded. + * + * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can + * be set by the producer to signal that a tensor of sub-byte type is padded. + */ +#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL) + + /*! 
+ * \brief A versioned and managed C Tensor object that manages the memory
+ * of a DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor
+ * by another framework. It is not meant to transfer the tensor. When the
+ * borrowing framework doesn't need the tensor, it should call the deleter
+ * to notify the host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+typedef struct DLManagedTensorVersioned
+{
+    /*!
+     * \brief The API and ABI version of the current managed Tensor.
+     */
+    DLPackVersion version;
+    /*!
+     * \brief the context of the original host framework.
+     *
+     * Stores the context in which the DLManagedTensorVersioned is used in
+     * the framework. It can also be NULL.
+     */
+    void *manager_ctx;
+    /*!
+     * \brief Destructor.
+     *
+     * This should be called to destruct manager_ctx which holds the
+     * DLManagedTensorVersioned. It can be NULL if there is no way for the
+     * caller to provide a reasonable destructor. The destructor deletes
+     * the argument self as well.
+     */
+    void (*deleter)(struct DLManagedTensorVersioned *self);
+    /*!
+     * \brief Additional bitmask flags information about the tensor.
+     *
+     * By default the flags should be set to 0.
+     *
+     * \note Future ABI changes should keep everything up to and including
+     * this field stable, to ensure that the deleter can be correctly
+     * called.
+     *
+     * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+     * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+     */
+    uint64_t flags;
+    /*! \brief DLTensor which is being memory managed */
+    DLTensor dl_tensor;
+} DLManagedTensorVersioned;
+
+//----------------------------------------------------------------------
+// DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions
+//----------------------------------------------------------------------
+/*!
+ * \brief Request a producer library to create a new tensor.
+ *
+ * Create a new `DLManagedTensorVersioned` within the context of the
+ * producer library. The allocation is defined via the prototype DLTensor.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
+ *     and device fields are used.
+ * \param out The output DLManagedTensorVersioned; on success `*out` owns
+ *     the newly created tensor.
+ * \param error_ctx Context for `SetError`.
+ * \param SetError The function to set the error.
+ * \return 0 on success, -1 on failure. SetError is called exactly when -1
+ *     is returned (the implementer must ensure this).
+ * \note - As a C function, it must not throw C++ exceptions.
+ *       - Errors are propagated via SetError to avoid any direct need of
+ *         the Python API. Because of this, `SetError` may have to ensure
+ *         the GIL is held, since it will presumably set a Python error.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorAllocator)( //
+    DLTensor *prototype,
+    DLManagedTensorVersioned **out,
+    void *error_ctx, //
+    void (*SetError)(void *error_ctx,
+                     const char *kind,
+                     const char *message) //
+);
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
+ *
+ * This function does not perform any stream synchronization. The consumer
+ * should query DLPackCurrentWorkStream to get the current work stream and
+ * launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *     as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLManagedTensorVersioned, owned by the caller.
+ * \return 0 on success, -1 on failure with a Python exception set. If the
+ *     data cannot be described using DLPack, the exception should be a
+ *     BufferError if possible.
+ * \note As a C function, it must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
+    void *py_object,                                  //
+    DLManagedTensorVersioned **out                    //
+);
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
+ *
+ * This function provides a faster interface for temporary, non-owning,
+ * exchange. The producer (implementer) still owns the memory of data,
+ * strides, shape. The liveness of the DLTensor and the data it views is
+ * only guaranteed until control is returned.
+ *
+ * This function currently assumes that the producer (implementer) can fill
+ * in the DLTensor shape and strides without the need for temporary
+ * allocations.
+ *
+ * This function does not perform any stream synchronization. The consumer
+ * should query DLPackCurrentWorkStream to get the current work stream and
+ * launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *     as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLTensor, whose space is pre-allocated on stack.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note As a C function, it must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
+    void *py_object,                             //
+    DLTensor *out                                //
+);
+
+/*!
+ * \brief Obtain the current work stream of a device.
+ *
+ * Obtain the current work stream of a device from the producer framework.
+ * For example, it should map to torch.cuda.current_stream in PyTorch.
+ *
+ * When device_type is kDLCPU, the consumer does not have to query the
+ * stream, and the producer can simply return NULL when queried. The
+ * consumer does not have to do anything about stream synchronization or
+ * setting, so a CPU-only framework can just provide a dummy implementation
+ * that always sets out_current_stream[0] to NULL.
+ *
+ * \param device_type The device type.
+ * \param device_id The device id.
+ * \param out_current_stream The output current work stream.
+ *
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note As a C function, it must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackCurrentWorkStream)( //
+    DLDeviceType device_type,           //
+    int32_t device_id,                  //
+    void **out_current_stream           //
+);
+
+/*!
+ * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
+ *
+ * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
+ * producer (implementer) library with the correct type.
+ *
+ * This function does not perform any stream synchronization.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param tensor The DLManagedTensorVersioned to convert; the ownership of
+ *     the tensor is stolen.
+ * \param out_py_object The output Python object.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
+    DLManagedTensorVersioned *tensor,               //
+    void **out_py_object                            //
+);
+
+/*!
+ * \brief DLPackExchangeAPI stable header.
+ * \sa DLPackExchangeAPI
+ */
+typedef struct DLPackExchangeAPIHeader
+{
+    /*!
+     * \brief The provided DLPack version. The consumer must check major
+     * version compatibility before using this struct.
+     */
+    DLPackVersion version;
+    /*!
+     * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
+     *
+     * It must be NULL if the framework does not support older versions.
+     * If the current major version is larger than the one supported by the
+     * consumer, the consumer may walk this chain to find an earlier
+     * supported version.
+     *
+     * \sa DLPackExchangeAPI
+     */
+    struct DLPackExchangeAPIHeader *prev_api;
+} DLPackExchangeAPIHeader;
+
+/*!
+ * \brief Framework-specific function pointer table for DLPack exchange.
+ *
+ * In addition to `__dlpack__()`, we define a C function table shareable by
+ * Python implementations via `__dlpack_c_exchange_api__`. This attribute
+ * must be set on the type as a Python PyCapsule with name
+ * "dlpack_exchange_api".
+ *
+ * A consumer library may use a pattern such as:
+ *
+ * \code
+ *
+ * PyObject *api_capsule = PyObject_GetAttrString(
+ *     (PyObject *)Py_TYPE(tensor_obj), "__dlpack_c_exchange_api__");
+ * if (api_capsule == NULL) { goto handle_error; }
+ * MyDLPackExchangeAPI *api = (MyDLPackExchangeAPI *)PyCapsule_GetPointer(
+ *     api_capsule, "dlpack_exchange_api"
+ * );
+ * Py_DECREF(api_capsule);
+ * if (api == NULL) { goto handle_error; }
+ *
+ * \endcode
+ *
+ * Note that this must be defined on the type. The consumer should look up
+ * the attribute on the type and may cache the result for each unique type.
+ *
+ * The precise API table is given by:
+ * \code
+ * struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
+ *     MyDLPackExchangeAPI() {
+ *         header.version.major = DLPACK_MAJOR_VERSION;
+ *         header.version.minor = DLPACK_MINOR_VERSION;
+ *         header.prev_api = nullptr;
+ *
+ *         managed_tensor_allocator = MyDLPackManagedTensorAllocator;
+ *         managed_tensor_from_py_object_no_sync =
+ *             MyDLPackManagedTensorFromPyObjectNoSync;
+ *         managed_tensor_to_py_object_no_sync =
+ *             MyDLPackManagedTensorToPyObjectNoSync;
+ *         dltensor_from_py_object_no_sync =
+ *             MyDLPackDLTensorFromPyObjectNoSync;
+ *         current_work_stream = MyDLPackCurrentWorkStream;
+ *     }
+ *
+ *     static const DLPackExchangeAPI *Global() {
+ *         static MyDLPackExchangeAPI inst;
+ *         return &inst;
+ *     }
+ * };
+ * \endcode
+ *
+ * Guidelines for leveraging DLPackExchangeAPI:
+ *
+ * There are generally two kinds of consumer needs for DLPack exchange:
+ * - N0: library support, where consumer.kernel(x, y, z) would like to run
+ *   a kernel with the data from x, y, z. The consumer is also expected to
+ *   run the kernel within the same stream context as the producer. For
+ *   example, when x, y, z are torch.Tensor, the consumer should query
+ *   exchange_api->current_work_stream to get the current stream and launch
+ *   the kernel on that stream. This setup avoids any synchronization at
+ *   kernel launch and gives maximum compatibility with CUDA graph capture
+ *   in the producer. This is the desirable behavior for library extension
+ *   support for frameworks like PyTorch.
+ * - N1: data ingestion and retention.
+ *
+ * Note that the obj.__dlpack__() API should already provide useful ways to
+ * address N1. The primary focus of the current DLPackExchangeAPI is to
+ * enable the faster exchange N0, with the support of the function pointer
+ * current_work_stream.
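+ *
+ * As an illustrative sketch of the N0 flow (the consumer-side names
+ * `x_obj` and `stream` here are hypothetical), a consumer holding the API
+ * table may do:
+ *
+ * \code
+ * DLTensor x_view;
+ * void *stream = NULL;
+ * if (api->dltensor_from_py_object_no_sync != NULL &&
+ *     api->dltensor_from_py_object_no_sync(x_obj, &x_view) == 0 &&
+ *     api->current_work_stream(x_view.device.device_type,
+ *                              x_view.device.device_id, &stream) == 0) {
+ *     // launch the consumer kernel on `stream`; no synchronization is
+ *     // needed because the producer's current stream is reused
+ * }
+ * \endcode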
+ *
+ * Array/Tensor libraries should statically create and initialize this
+ * structure, and expose a pointer to it (as the
+ * `__dlpack_c_exchange_api__` PyCapsule described above) on the
+ * Tensor/Array type. The DLPackExchangeAPI* must stay alive throughout
+ * the lifetime of the process.
+ *
+ * One simple way to do so is to create a static instance of
+ * DLPackExchangeAPI within the framework and return a pointer to it; the
+ * MyDLPackExchangeAPI example above shows how to do so in C++. It should
+ * also be reasonably easy to do so in other languages.
+ */
+typedef struct DLPackExchangeAPI
+{
+    /*!
+     * \brief The header that remains stable across versions.
+     */
+    DLPackExchangeAPIHeader header;
+    /*!
+     * \brief Producer function pointer for DLPackManagedTensorAllocator.
+     * This function must not be NULL.
+     * \sa DLPackManagedTensorAllocator
+     */
+    DLPackManagedTensorAllocator managed_tensor_allocator;
+    /*!
+     * \brief Producer function pointer for
+     * DLPackManagedTensorFromPyObjectNoSync. This function must not be
+     * NULL.
+     * \sa DLPackManagedTensorFromPyObjectNoSync
+     */
+    DLPackManagedTensorFromPyObjectNoSync
+        managed_tensor_from_py_object_no_sync;
+    /*!
+     * \brief Producer function pointer for
+     * DLPackManagedTensorToPyObjectNoSync. This function must not be NULL.
+     * \sa DLPackManagedTensorToPyObjectNoSync
+     */
+    DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
+    /*!
+     * \brief Producer function pointer for
+     * DLPackDLTensorFromPyObjectNoSync. This function can be NULL when the
+     * producer does not support it.
+     * \sa DLPackDLTensorFromPyObjectNoSync
+     */
+    DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
+    /*!
+     * \brief Producer function pointer for DLPackCurrentWorkStream.
+     * This function must not be NULL.
+     * \sa DLPackCurrentWorkStream
+     */
+    DLPackCurrentWorkStream current_work_stream;
+} DLPackExchangeAPI;
+
+#ifdef __cplusplus
+} // DLPACK_EXTERN_C
+#endif
+#endif // DLPACK_DLPACK_H_
diff --git a/dpnp/tensor/libtensor/include/kernels/accumulators.hpp b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
new file mode 100644
index 000000000000..9449c030ac67
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/accumulators.hpp
@@ -0,0 +1,1427 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for accumulators (cumulative sum, prod, etc.).
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::accumulators
+{
+
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+template <typename T>
+T ceiling_quotient(T n, T m)
+{
+    return (n + m - 1) / m;
+}
+
+template <typename inputT, typename outputT>
+struct NonZeroIndicator
+{
+    constexpr NonZeroIndicator() {}
+
+    outputT operator()(const inputT &val) const
+    {
+        static constexpr outputT out_one(1);
+        static constexpr outputT out_zero(0);
+        static constexpr inputT val_zero(0);
+
+        return (val == val_zero) ? out_zero : out_one;
+    }
+};
+
+template <typename T>
+struct NoOpTransformer
+{
+    constexpr NoOpTransformer() {}
+
+    T operator()(const T &val) const { return val; }
+};
+
+template <typename srcTy, typename dstTy>
+struct CastTransformer
+{
+    constexpr CastTransformer() {}
+
+    dstTy operator()(const srcTy &val) const
+    {
+        using dpctl::tensor::type_utils::convert_impl;
+        return convert_impl<dstTy, srcTy>(val);
+    }
+};
+
+template <typename T, typename ScanOpT>
+struct needs_workaround
+{
+    // workaround needed due to crash in JITing on CPU
+    // remove when CMPLRLLVM-65813 is resolved
+    static constexpr bool value = su_ns::IsSyclLogicalAnd<T, ScanOpT>::value ||
+                                  su_ns::IsSyclLogicalOr<T, ScanOpT>::value;
+};
+
+template <typename T, typename ScanOpT>
+struct can_use_inclusive_scan_over_group
+{
+    static constexpr bool value = sycl::has_known_identity<ScanOpT, T>::value &&
+                                  !needs_workaround<T, ScanOpT>::value;
+};
+
+namespace detail
+{
+template <typename T>
+class stack_t
+{
+    T *src_;
+    std::size_t size_;
+    T *local_scans_;
+
+public:
+    stack_t() : src_{}, size_{}, local_scans_{} {}
+    stack_t(T *src, std::size_t sz, T *local_scans)
+        : src_(src), size_(sz), local_scans_(local_scans)
+    {
+    }
+    ~stack_t() {}
+
+    T *get_src_ptr() const { return src_; }
+
+    std::size_t get_size() const { return size_; }
+
+    T *get_local_scans_ptr() const { return local_scans_; }
+};
+
+template <typename T>
+class stack_strided_t
+{
+    T *src_;
+    std::size_t size_;
+    T *local_scans_;
+    std::size_t local_stride_;
+
+public:
+    stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {}
+    stack_strided_t(T *src,
+                    std::size_t sz,
+                    T *local_scans,
+                    std::size_t local_stride)
+        : src_(src), size_(sz), local_scans_(local_scans),
+          local_stride_(local_stride)
+    {
+    }
+    ~stack_strided_t() {}
+
+    T *get_src_ptr() const { return src_; }
+
+    std::size_t get_size() const { return size_; }
+
+    T *get_local_scans_ptr() const { return local_scans_; }
+
+    std::size_t get_local_stride() const {
return local_stride_; } +}; + +} // end of namespace detail + +// Iterative cumulative summation + +using nwiT = std::uint32_t; + +template +class inclusive_scan_iter_local_scan_blocked_krn; + +template +class inclusive_scan_iter_local_scan_striped_krn; + +template +sycl::event inclusive_scan_base_step_blocked( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + + auto ndRange = sycl::nd_range<1>(gws, lws); + + slmT slm_iscan_tmp(lws, cgh); + + using KernelName = inclusive_scan_iter_local_scan_blocked_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_id(0); + const std::size_t lid = it.get_local_id(0); + + const std::uint32_t wg_size = it.get_local_range(0); + const std::size_t reduce_chunks = acc_groups * wg_size; + const std::size_t iter_gid = gid / reduce_chunks; + const std::size_t chunk_gid = gid - (iter_gid * reduce_chunks); + + const std::size_t i = chunk_gid * n_wi; + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t i_m_wi = i + m_wi; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (i_m_wi < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * i_m_wi)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (i_m_wi < acc_nelems && i_m_wi > 0) + ? transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * i_m_wi) - 1)]) + : identity; + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), it.get_sub_group(), slm_iscan_tmp, + local_iscan.back(), identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? 
identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + const std::size_t start = std::min(i, acc_nelems); + const std::size_t end = std::min(i + n_wi, acc_nelems); + const nwiT m_max = static_cast(end - start); + for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) { + output[out_iter_offset + out_indexer(i + m_wi)] = + local_iscan[m_wi]; + } + }); + }); + + return inc_scan_phase1_ev; +} + +template +sycl::event inclusive_scan_base_step_striped( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + const std::uint32_t reduce_nelems_per_wg = n_wi * wg_size; + acc_groups = + ceiling_quotient(acc_nelems, reduce_nelems_per_wg); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + const auto &gRange = sycl::range<1>{iter_nelems * acc_groups * wg_size}; + const auto &lRange = sycl::range<1>{wg_size}; + + const auto &ndRange = sycl::nd_range<1>{gRange, lRange}; + + slmT slm_iscan_tmp(reduce_nelems_per_wg, cgh); + + using KernelName = inclusive_scan_iter_local_scan_striped_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::uint32_t lid = it.get_local_linear_id(); + const std::uint32_t wg_size = it.get_local_range(0); + + const auto &sg = it.get_sub_group(); + const std::uint32_t sgSize = sg.get_max_local_range()[0]; + const std::size_t sgroup_id = sg.get_group_id()[0]; + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t flat_group_id = it.get_group(0); + const std::size_t iter_gid = flat_group_id / acc_groups; + const std::size_t acc_group_id = + flat_group_id - (iter_gid * acc_groups); + + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan{}; + + const std::size_t inp_id0 = acc_group_id * n_wi * wg_size + + sgroup_id * n_wi * sgSize + lane_id; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t inp_id = inp_id0 + m_wi * sgSize; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (inp_id < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * inp_id)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (inp_id < acc_nelems && inp_id > 0) + ? 
transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * inp_id) - 1)]) + : identity; + } + } + + // change layout from striped to blocked + { + { + const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + slm_iscan_tmp[local_offset0 + i] = local_iscan[i]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + + { + const std::uint32_t block_offset = + sgroup_id * sgSize * n_wi; + const std::uint32_t disp0 = lane_id * n_wi; +#pragma unroll + for (nwiT i = 0; i < n_wi; ++i) { + const std::uint32_t disp = disp0 + i; + + // disp == lane_id1 + i1 * sgSize; + const std::uint32_t i1 = disp / sgSize; + const std::uint32_t lane_id1 = disp - i1 * sgSize; + + const std::uint32_t disp_exchanged = + (lane_id1 * n_wi + i1); + + local_iscan[i] = + slm_iscan_tmp[block_offset + disp_exchanged]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), sg, slm_iscan_tmp, local_iscan.back(), + identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + it.barrier(sycl::access::fence_space::local_space); + + // convert back to blocked layout + { + { + const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + + { + const std::uint32_t block_offset = + sgroup_id * sgSize * n_wi + lane_id; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::uint32_t m_wi_scaled = m_wi * sgSize; + const std::size_t out_id = inp_id0 + m_wi_scaled; + if (out_id < acc_nelems) { + output[out_iter_offset + out_indexer(out_id)] = + slm_iscan_tmp[block_offset + m_wi_scaled]; + } + } + } + }); + }); + + return inc_scan_phase1_ev; +} + +template +sycl::event + inclusive_scan_base_step(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + // For small stride use striped load/store. + // Threshold value chosen experimentally. 
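+    // Explanatory note: with a unit stride (s1 == 1) the striped variant
+    // makes consecutive work-items in a sub-group load consecutive
+    // elements, so each sub-group load is coalesced; in the blocked
+    // variant each work-item owns a contiguous run of n_wi elements, so
+    // neighboring work-items touch addresses n_wi elements apart. The
+    // cut-off of 16 below is the experimentally chosen threshold
+    // mentioned above.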
+ if (s1 <= 16) { + return inclusive_scan_base_step_striped< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } + else { + return inclusive_scan_base_step_blocked< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } +} + +template +class inclusive_scan_1d_iter_chunk_update_krn; + +template +sycl::event update_local_chunks_1d(sycl::queue &exec_q, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + const sycl::event &dependent_event) +{ + const auto &ctx = exec_q.get_context(); + const auto &dev = exec_q.get_device(); + + const auto &kernel_id = sycl::get_kernel_id(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // output[ chunk_size * (i + 1) + j] += temp[i] + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + cgh.use_kernel_bundle(kb); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t n_items = + ceiling_quotient(src_size, sg_size * n_wi) * sg_size; + + sycl::range<1> gRange{n_items}; + sycl::range<1> lRange{sg_size}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for( + ndRange, + [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(0); + const std::size_t block_offset = ndit.get_group(0) * n_wi * lws; +#pragma unroll + for (std::size_t i = 0; i < updates_per_wi; ++i) { + const std::size_t src_id = + block_offset + ndit.get_local_id(0) + i * lws; + if (src_id < src_size) { + const std::size_t scan_id = (src_id / chunk_size); + const outputT modifier = + (scan_id > 0) ? 
local_scans[scan_id - 1] : identity; + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +/* + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t n_elems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IndexerT &indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + static constexpr std::size_t _iter_nelems = 1; + + using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; + static constexpr IterIndexerT _no_op_iter_indexer{}; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + + std::size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1, + _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op, + identity, n_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t n_groups_ = n_groups; + std::size_t temp_size = 0; + while (n_groups_ > 1) { + const std::size_t this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(temp_size, + exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + const std::size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, _no_op_iter_indexer, + _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op, + identity, n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + const outputT *local_scans = stack_elem.get_local_scans_ptr(); + + using UpdateKernelName = + class inclusive_scan_1d_iter_chunk_update_krn; + + dependent_event = update_local_chunks_1d( + exec_q, src, src_size, local_scans, chunk_size, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); + + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_1d_contig_impl(sycl::queue 
&q, + std::size_t n_elems, + const char *src, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + return comp_ev; +} + +template +class inclusive_scan_final_chunk_update_krn; + +template +sycl::event final_update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + const OutIterIndexerT &out_iter_indexer, + const OutIndexerT &out_indexer, + sycl::event dependent_event) +{ + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t updates_per_sg = sg_size * updates_per_wi; + const std::size_t update_nelems = + ceiling_quotient(src_size, updates_per_sg) * sg_size; + + sycl::range<2> gRange{iter_nelems, update_nelems}; + sycl::range<2> lRange{1, sg_size}; + + sycl::nd_range<2> ndRange{gRange, lRange}; + + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + cgh.parallel_for( + ndRange, [chunk_size, src_size, local_stride, src, local_scans, + out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t iter_gid = ndit.get_group(0); + + const std::size_t src_axis_id0 = + ndit.get_group(1) * updates_per_wi * lws + + ndit.get_local_id(1); + const std::size_t src_iter_id = out_iter_indexer(iter_gid); +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const std::size_t src_axis_id = src_axis_id0 + i * lws; + const std::size_t src_id = + out_indexer(src_axis_id) + src_iter_id; + + if (src_axis_id < src_size) { + const std::size_t scan_axis_id = + src_axis_id / chunk_size; + const std::size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + const outputT modifier = (scan_axis_id > 0) + ? 
local_scans[scan_id - 1] + : identity; + + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + sycl::event dependent_event) +{ + static constexpr NoOpIndexer out_indexer{}; + static constexpr NoOpIndexer iter_out_indexer{}; + + return final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, iter_out_indexer, out_indexer, dependent_event); +} + +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const InpIterIndexerT &inp_iter_indexer, + const OutIterIndexerT &out_iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InpIterIndexerT, OutIterIndexerT>; + const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer}; + + std::size_t acc_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (acc_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t acc_groups_ = acc_groups; + std::size_t temp_size = 0; + while (acc_groups_ > 1) { + const std::size_t this_size = (acc_groups_ - 1); + temp_size += this_size; + acc_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * temp_size, exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + acc_groups_ = acc_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = acc_nelems; + + { + std::size_t src_size = acc_groups - 1; + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + OutIterIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{out_iter_indexer, + scan_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, out_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + 
local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + while (acc_groups_ > 1) { + std::size_t src_size = acc_groups_ - 1; + + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan1_iter_indexer{ + /* size */ iter_nelems, + /* step */ size_to_update}; + const LocalScanIndexerT scan2_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + LocalScanIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{scan1_iter_indexer, + scan2_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; + reverse_stack_id < stack.size() - 1; ++reverse_stack_id) { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn; + + dependent_event = + update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, dependent_event); + } + + // last stack element is always directly to output + { + const auto &stack_elem = stack[0]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + const std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_final_chunk_update_krn< + outputT, n_wi, OutIterIndexerT, OutIndexerT, ScanOpT>; + + dependent_event = + final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, out_iter_indexer, out_indexer, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_strided_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t acc_nelems, + const char *src, + int iter_nd, + const ssize_t *iter_shape_strides, + ssize_t inp_iter_offset, + ssize_t out_iter_offset, + int acc_nd, + const ssize_t *acc_shape_strides, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides}; + const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset, + iter_shape_strides}; + + using OutIndexerT = 
dpctl::tensor::offset_utils::UnpackedStridedIndexer; + const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides, + acc_shape_strides + 2 * acc_nd}; + const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset, + iter_shape_strides, + iter_shape_strides + 2 * iter_nd}; + + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + + return comp_ev; +} + +typedef std::size_t (*cumsum_val_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t cumsum_val_contig_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 
64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM host allocation, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsContigFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct MaskPositionsContigFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct Cumsum1DContigFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +typedef std::size_t (*cumsum_val_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t + cumsum_val_strided_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + int nd, + const ssize_t *shape_strides, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexerT strided_indexer{nd, 0, shape_strides}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 
64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM-host temporary, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsStridedFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct MaskPositionsStridedFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct Cumsum1DStridedFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +} // namespace dpctl::tensor::kernels::accumulators diff --git a/dpnp/tensor/libtensor/include/kernels/alignment.hpp b/dpnp/tensor/libtensor/include/kernels/alignment.hpp new file mode 100644 index 000000000000..a67e9b15306e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/alignment.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace dpctl::tensor::kernels::alignment_utils
+{
+inline constexpr std::size_t required_alignment = 64UL;
+
+template <std::uintptr_t alignment, typename Ptr>
+bool is_aligned(Ptr p)
+{
+    return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
+}
+
+template <typename KernelName>
+class disabled_sg_loadstore_wrapper_krn;
+} // namespace dpctl::tensor::kernels::alignment_utils
diff --git a/dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
new file mode 100644
index 000000000000..046ad87d7d78
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -0,0 +1,853 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for advanced tensor index operations.
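+///
+/// The mask-based kernels below consume the inclusive cumulative sum of the
+/// boolean mask: when the mask is set at flat position i, `cumsum[i] - 1` is
+/// the position of element i in the compacted output, which is what makes
+/// `dst = src[mask]`-style extraction and `dst[mask] = rhs`-style placement
+/// data-parallel.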
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::kernels::indexing
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+template <typename OrthogIndexerT,
+          typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT,
+          typename LocalAccessorT>
+struct MaskedExtractStridedFunctor
+{
+    MaskedExtractStridedFunctor(const dataT *src_data_p,
+                                const indT *cumsum_data_p,
+                                dataT *dst_data_p,
+                                std::size_t masked_iter_size,
+                                const OrthogIndexerT &orthog_src_dst_indexer_,
+                                const MaskedSrcIndexerT &masked_src_indexer_,
+                                const MaskedDstIndexerT &masked_dst_indexer_,
+                                const LocalAccessorT &lacc_)
+        : src(src_data_p), cumsum(cumsum_data_p), dst(dst_data_p),
+          masked_nelems(masked_iter_size),
+          orthog_src_dst_indexer(orthog_src_dst_indexer_),
+          masked_src_indexer(masked_src_indexer_),
+          masked_dst_indexer(masked_dst_indexer_), lacc(lacc_)
+    {
+        static_assert(
+            std::is_same_v<indT, typename LocalAccessorT::value_type>);
+    }
+
+    void operator()(sycl::nd_item<2> ndit) const
+    {
+        const std::size_t orthog_i = ndit.get_global_id(0);
+        const std::uint32_t l_i = ndit.get_local_id(1);
+        const std::uint32_t lws = ndit.get_local_range(1);
+
+        const std::size_t masked_i = ndit.get_global_id(1);
+        const std::size_t masked_block_start = masked_i - l_i;
+
+        const std::size_t max_offset = masked_nelems + 1;
+        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+            const std::size_t offset = masked_block_start + i;
+            lacc[i] = (offset == 0)           ? indT(0)
+                      : (offset < max_offset) ? cumsum[offset - 1]
+                                              : cumsum[masked_nelems - 1] + 1;
+        }
+
+        sycl::group_barrier(ndit.get_group());
+
+        const indT current_running_count = lacc[l_i + 1];
+        const bool mask_set = (masked_i == 0)
+                                  ? (current_running_count == 1)
+                                  : (current_running_count == lacc[l_i] + 1);
+
+        // dst[cumsum[i] - 1, j] = src[i, j]
+        // if cumsum[i] == ((i > 0) ?
+template <typename OrthogIndexerT,
+          typename MaskedDstIndexerT,
+          typename MaskedRhsIndexerT,
+          typename dataT,
+          typename indT,
+          typename LocalAccessorT>
+struct MaskedPlaceStridedFunctor
+{
+    MaskedPlaceStridedFunctor(dataT *dst_data_p,
+                              const indT *cumsum_data_p,
+                              const dataT *rhs_data_p,
+                              std::size_t masked_iter_size,
+                              const OrthogIndexerT &orthog_dst_rhs_indexer_,
+                              const MaskedDstIndexerT &masked_dst_indexer_,
+                              const MaskedRhsIndexerT &masked_rhs_indexer_,
+                              const LocalAccessorT &lacc_)
+        : dst(dst_data_p), cumsum(cumsum_data_p), rhs(rhs_data_p),
+          masked_nelems(masked_iter_size),
+          orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_),
+          masked_dst_indexer(masked_dst_indexer_),
+          masked_rhs_indexer(masked_rhs_indexer_), lacc(lacc_)
+    {
+        static_assert(
+            std::is_same_v<indT, typename LocalAccessorT::value_type>);
+    }
+
+    void operator()(sycl::nd_item<2> ndit) const
+    {
+        const std::size_t orthog_i = ndit.get_global_id(0);
+        const std::uint32_t l_i = ndit.get_local_id(1);
+        const std::uint32_t lws = ndit.get_local_range(1);
+
+        const std::size_t masked_i = ndit.get_global_id(1);
+        const std::size_t masked_block_start = masked_i - l_i;
+
+        const std::size_t max_offset = masked_nelems + 1;
+        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+            const std::size_t offset = masked_block_start + i;
+            lacc[i] = (offset == 0) ? indT(0)
+                      : (offset < max_offset) ? cumsum[offset - 1]
+                                              : cumsum[masked_nelems - 1] + 1;
+        }
+
+        sycl::group_barrier(ndit.get_group());
+
+        const indT current_running_count = lacc[l_i + 1];
+        const bool mask_set = (masked_i == 0)
+                                  ? (current_running_count == 1)
+                                  : (current_running_count == lacc[l_i] + 1);
+
+        // dst[i, j] = rhs[cumsum[i] - 1, j]
+        // if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1)
+        if (mask_set && (masked_i < masked_nelems)) {
+            const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i);
+
+            const std::size_t total_dst_offset =
+                masked_dst_indexer(masked_i) +
+                orthog_offsets.get_first_offset();
+            const std::size_t total_rhs_offset =
+                masked_rhs_indexer(current_running_count - 1) +
+                orthog_offsets.get_second_offset();
+
+            dst[total_dst_offset] = rhs[total_rhs_offset];
+        }
+    }
+
+private:
+    dataT *dst = nullptr;
+    const indT *cumsum = nullptr;
+    const dataT *rhs = nullptr;
+    std::size_t masked_nelems = 0;
+    // has nd, shape, dst_strides, rhs_strides for
+    // dimensions that ARE NOT masked
+    OrthogIndexerT orthog_dst_rhs_indexer;
+    // has nd, shape, dst_strides for
+    // dimensions that ARE masked
+    MaskedDstIndexerT masked_dst_indexer;
+    // has 1, rhs_strides for dimensions that ARE masked
+    MaskedRhsIndexerT masked_rhs_indexer;
+    LocalAccessorT lacc;
+};
+
+// ======= Masked extraction ================================
+
+namespace detail
+{
+
+template <std::size_t I, std::size_t... IR>
+std::size_t _get_lws_impl(std::size_t n)
+{
+    if constexpr (sizeof...(IR) == 0) {
+        return I;
+    }
+    else {
+        return (n < I) ? _get_lws_impl<IR...>(n) : I;
+    }
+}
+
+inline std::size_t get_lws(std::size_t n)
+{
+    static constexpr std::size_t lws0 = 256u;
+    static constexpr std::size_t lws1 = 128u;
+    static constexpr std::size_t lws2 = 64u;
+    return _get_lws_impl<lws0, lws1, lws2>(n);
+}
+
+} // end of namespace detail
+
+template <typename MaskedDstIndexerT, typename dataT, typename indT>
+class masked_extract_all_slices_contig_impl_krn;
+
+typedef sycl::event (*masked_extract_all_slices_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    const char *,
+    const char *,
+    char *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename dataT, typename indT>
+sycl::event masked_extract_all_slices_contig_impl(
+    sycl::queue &exec_q,
+    ssize_t iteration_size,
+    const char *src_p,
+    const char *cumsum_p,
+    char *dst_p,
+    ssize_t dst_size, // dst is 1D
+    ssize_t dst_stride,
+    const std::vector<sycl::event> &depends = {})
+{
+    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
+
+    static constexpr NoOpIndexer masked_src_indexer{};
+    const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
+                                              /* step */ dst_stride);
+
+    using KernelName =
+        class masked_extract_all_slices_contig_impl_krn<Strided1DIndexer,
+                                                        dataT, indT>;
+
+    using LocalAccessorT = sycl::local_accessor<indT, 1>;
+    using Impl =
+        struct MaskedExtractStridedFunctor<TwoZeroOffsets_Indexer,
+                                           NoOpIndexer, Strided1DIndexer,
+                                           dataT, indT, LocalAccessorT>;
+
+    const std::size_t masked_extent = iteration_size;
+
+    const std::size_t lws = detail::get_lws(masked_extent);
+
+    const std::size_t n_groups = (iteration_size + lws - 1) / lws;
+
+    sycl::range<2> gRange{1, n_groups * lws};
+    sycl::range<2> lRange{1, lws};
+
+    sycl::nd_range<2> ndRange(gRange, lRange);
+
+    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
+    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
+    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
+        LocalAccessorT lacc(lacc_size, cgh);
+
+        cgh.parallel_for<KernelName>(
+            ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_extent,
+                          orthog_src_dst_indexer, masked_src_indexer,
+                          masked_dst_indexer, lacc));
+    });
+
+    return comp_ev;
+}
+
+template <typename MaskedSrcIndexerT,
+          typename MaskedDstIndexerT,
+          typename dataT,
+          typename indT>
+class masked_extract_all_slices_strided_impl_krn;
+
+typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    ssize_t,
+    const char *,
+    const char *,
+    char *,
+    int,
+    ssize_t const *,
+    ssize_t,
+    ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename dataT, typename indT>
+sycl::event masked_extract_all_slices_strided_impl(
+    sycl::queue &exec_q,
+    ssize_t
iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, + /* step */ dst_stride); + + using KernelName = class masked_extract_all_slices_strided_impl_krn< + StridedIndexer, Strided1DIndexer, dataT, indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_nelems = iteration_size; + + const std::size_t lws = detail::get_lws(masked_nelems); + + const std::size_t n_groups = (masked_nelems + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_nelems) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + const char *, + const char *, + char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + // [ortho_shape, ortho_src_strides, // ortho_dst_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_src_dst_shape_strides, + ssize_t ortho_src_offset, + ssize_t ortho_dst_offset, + int masked_nd, + // [masked_src_shape, masked_src_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_src_shape_strides, + ssize_t masked_dst_size, + ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + const StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size, + /* step */ masked_dst_stride}; + + using KernelName = class masked_extract_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DIndexer, dataT, + indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_extent = masked_nelems; + + const std::size_t lws = detail::get_lws(masked_extent); + + const std::size_t n_groups = ((masked_extent + lws - 1) / lws); + const std::size_t orthog_extent = static_cast(orthog_nelems); + + sycl::range<2> gRange{orthog_extent, n_groups * 
lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = + std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskExtractAllSlicesContigFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesContigFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const ssize_t + *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + ssize_t rhs_size, // rhs is 1D + ssize_t rhs_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + const Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + using KernelName = class masked_place_all_slices_strided_impl_krn< + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT, + indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t masked_extent = iteration_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, iteration_size, + 
orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event masked_place_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + // [ortho_shape, ortho_dst_strides, ortho_rhs_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_dst_rhs_shape_strides, + ssize_t ortho_dst_offset, + ssize_t ortho_rhs_offset, + int masked_nd, + // [masked_dst_shape, masked_dst_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_dst_shape_strides, + ssize_t masked_rhs_size, + ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + const Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; + + using KernelName = class masked_place_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t orthog_extent = orthog_nelems; + const std::size_t masked_extent = masked_nelems; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskPlaceAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +template +class non_zero_indexes_krn; + +typedef sycl::event (*non_zero_indexes_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + int, + const char *, + char *, + const ssize_t *, + std::vector const &); + 
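+// Illustrative note (added commentary, not upstream code): for every set
+// element of the flattened mask, the kernel below writes its multi-index,
+// unraveled in C (row-major) order against mask_shape, into a
+// (nd, nz_elems)-shaped output. E.g. for a mask of shape (2, 3) with flat
+// index i = 4 and output slot cs_val = cumsum[4] - 1, the loop peels
+// dimensions right to left: 4 = 1 * 3 + 1, so it stores
+// indexes[cs_val + 1 * nz_elems] = 1 (column) and indexes[cs_val] = 1 (row).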
+template <typename indT1, typename indT2>
+sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
+                                  ssize_t iter_size,
+                                  ssize_t nz_elems,
+                                  int nd,
+                                  const char *cumsum_cp,
+                                  char *indexes_cp,
+                                  const ssize_t *mask_shape,
+                                  std::vector<sycl::event> const &depends)
+{
+    const indT1 *cumsum_data = reinterpret_cast<const indT1 *>(cumsum_cp);
+    indT2 *indexes_data = reinterpret_cast<indT2 *>(indexes_cp);
+
+    static constexpr std::size_t nominal_lws = 256u;
+    const std::size_t masked_extent = iter_size;
+    const std::size_t lws = std::min(masked_extent, nominal_lws);
+
+    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
+    sycl::range<1> gRange{n_groups * lws};
+    sycl::range<1> lRange{lws};
+
+    sycl::nd_range<1> ndRange{gRange, lRange};
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
+        sycl::local_accessor<indT1, 1> lacc(lacc_size, cgh);
+
+        using KernelName = class non_zero_indexes_krn<indT1, indT2>;
+
+        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
+            const std::size_t group_i = ndit.get_group(0);
+            const std::uint32_t l_i = ndit.get_local_id(0);
+            const std::uint32_t lws = ndit.get_local_range(0);
+
+            const std::size_t masked_block_start = group_i * lws;
+
+            for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
+                const std::size_t offset = masked_block_start + i;
+                lacc[i] = (offset == 0) ? indT1(0)
+                          : (offset - 1 < masked_extent)
+                              ? cumsum_data[offset - 1]
+                              : cumsum_data[masked_extent - 1] + 1;
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            const std::size_t i = masked_block_start + l_i;
+            const auto cs_val = lacc[l_i];
+            const bool cond = (lacc[l_i + 1] == cs_val + 1);
+
+            if (cond && (i < masked_extent)) {
+                ssize_t i_ = static_cast<ssize_t>(i);
+                for (int dim = nd; --dim > 0;) {
+                    const auto sd = mask_shape[dim];
+                    const ssize_t q = i_ / sd;
+                    const ssize_t r = (i_ - q * sd);
+                    indexes_data[cs_val + dim * nz_elems] =
+                        static_cast<indT2>(r);
+                    i_ = q;
+                }
+                indexes_data[cs_val] = static_cast<indT2>(i_);
+            }
+        });
+    });
+
+    return comp_ev;
+}
+
+} // namespace dpctl::tensor::kernels::indexing
diff --git a/dpnp/tensor/libtensor/include/kernels/clip.hpp b/dpnp/tensor/libtensor/include/kernels/clip.hpp
new file mode 100644
index 000000000000..900fcf3df100
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/clip.hpp
@@ -0,0 +1,356 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for dpctl.tensor.clip.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpctl_tensor_types.hpp"
+#include "kernels/alignment.hpp"
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::kernels::clip
+{
+
+using dpctl::tensor::ssize_t;
+using namespace dpctl::tensor::offset_utils;
+
+using dpctl::tensor::kernels::alignment_utils::
+    disabled_sg_loadstore_wrapper_krn;
+using dpctl::tensor::kernels::alignment_utils::is_aligned;
+using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+using dpctl::tensor::sycl_utils::sub_group_load;
+using dpctl::tensor::sycl_utils::sub_group_store;
+
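+// Added commentary (assumption about dpctl's math_utils, not verified here):
+// for complex types the clip() helper below delegates to max_complex /
+// min_complex, which are understood to order values lexicographically by
+// real part and then by imaginary part, so complex clipping follows NumPy's
+// ordering of complex numbers rather than clamping real and imaginary parts
+// independently.
+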
+template <typename T>
+T clip(const T &x, const T &min, const T &max)
+{
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr (is_complex<T>::value) {
+        using dpctl::tensor::math_utils::max_complex;
+        using dpctl::tensor::math_utils::min_complex;
+        return min_complex<T>(max_complex<T>(x, min), max);
+    }
+    else if constexpr (std::is_floating_point_v<T> ||
+                       std::is_same_v<T, sycl::half>)
+    {
+        auto tmp = (std::isnan(x) || x > min) ? x : min;
+        return (std::isnan(tmp) || tmp < max) ? tmp : max;
+    }
+    else if constexpr (std::is_same_v<T, bool>) {
+        return (x || min) && max;
+    }
+    else {
+        auto tmp = (x > min) ? x : min;
+        return (tmp < max) ? tmp : max;
+    }
+}
+
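+// Illustrative note (added commentary): for floating-point T the branches
+// above propagate NaN instead of clamping it, matching numpy.clip():
+// clip(NaN, 0.0, 1.0) == NaN (the isnan(x) test selects x, and isnan(tmp)
+// then returns it), while clip(2.0, 0.0, 1.0) == 1.0. NaN bounds likewise
+// yield NaN, since ordered comparisons against NaN are false.
+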
+template <typename T,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+class ClipContigFunctor
+{
+private:
+    std::size_t nelems = 0;
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+
+public:
+    ClipContigFunctor(std::size_t nelems_,
+                      const T *x_p_,
+                      const T *min_p_,
+                      const T *max_p_,
+                      T *dst_p_)
+        : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_),
+          dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
+            const std::uint16_t sgSize =
+                ndit.get_sub_group().get_local_range()[0];
+            const std::size_t gid = ndit.get_global_linear_id();
+            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
+
+            const std::size_t start =
+                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
+            const std::size_t end = std::min(nelems, start + nelems_per_sg);
+
+            for (std::size_t offset = start; offset < end; offset += sgSize) {
+                dst_p[offset] =
+                    clip(x_p[offset], min_p[offset], max_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+
+            const std::size_t base =
+                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                 sg.get_group_id()[0] * sgSize);
+
+            if (base + nelems_per_wi * sgSize < nelems) {
+                sycl::vec<T, vec_sz> dst_vec;
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    const std::size_t idx = base + it * sgSize;
+                    auto x_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x_p[idx]);
+                    auto min_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&min_p[idx]);
+                    auto max_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&max_p[idx]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[idx]);
+
+                    const sycl::vec<T, vec_sz> x_vec =
+                        sub_group_load<vec_sz>(sg, x_multi_ptr);
+                    const sycl::vec<T, vec_sz> min_vec =
+                        sub_group_load<vec_sz>(sg, min_multi_ptr);
+                    const sycl::vec<T, vec_sz> max_vec =
+                        sub_group_load<vec_sz>(sg, max_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
+                                               max_vec[vec_id]);
+                    }
+                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
+                }
+            }
+            else {
+                const std::size_t lane_id = sg.get_local_id()[0];
+                for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
+                    dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
+                }
+            }
+        }
+    }
+};
+
+template <typename T, std::uint8_t vec_sz, std::uint8_t n_vecs>
+class clip_contig_kernel;
+
+typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_contig_impl(sycl::queue &q,
+                             std::size_t nelems,
+                             const char *x_cp,
+                             const char *min_cp,
+                             const char *max_cp,
+                             char *dst_cp,
+                             const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        std::size_t lws = 64;
+        static constexpr std::uint8_t vec_sz = 4;
+        static constexpr std::uint8_t n_vecs = 2;
+        const std::size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) /
(lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(x_cp) && + is_aligned(min_cp) && + is_aligned(max_cp) && + is_aligned(dst_cp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = clip_contig_kernel; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = clip_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + }); + + return clip_ev; +} + +template +class ClipStridedFunctor +{ +private: + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + IndexerT indexer; + +public: + ClipStridedFunctor(const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_, + const IndexerT &indexer_) + : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + dst_p[offsets.get_fourth_offset()] = clip( + x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()], + max_p[offsets.get_third_offset()]); + } +}; + +template +class clip_strided_kernel; + +typedef sycl::event (*clip_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event clip_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x_offset, + ssize_t min_offset, + ssize_t max_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, x_offset, min_offset, max_offset, dst_offset, shape_strides}; + + using KernelName = clip_strided_kernel; + using Impl = ClipStridedFunctor; + + cgh.parallel_for( + sycl::range<1>(nelems), + Impl(x_tp, min_tp, max_tp, dst_tp, indexer)); + }); + + return clip_ev; +} + +template +struct ClipStridedFactory +{ + fnT get() + { + fnT fn = clip_strided_impl; + return fn; + } +}; + +template +struct ClipContigFactory +{ + fnT get() + { + + fnT fn = clip_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::clip diff --git a/dpnp/tensor/libtensor/include/kernels/constructors.hpp b/dpnp/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 000000000000..67f2502067ca --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,575 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::constructors +{ + +using dpctl::tensor::ssize_t; + +/*! + @defgroup CtorKernels + */ + +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; +template +class full_strided_kernel; +template +class eye_kernel; + +using namespace dpctl::tensor::offset_utils; + +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Starting value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? 
nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + +/* ================ Full ================== */ + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &q, + std::size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +class FullStridedFunctor +{ +private: + Ty *p = nullptr; + Ty fill_v; + IndexerT indexer; + +public: + FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_) + : p(p_), fill_v(fill_v_), indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + auto offset = indexer(id.get(0)); + p[offset] = fill_v; + } +}; + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &q, + int nd, + std::size_t nelems, + const ssize_t *shape_strides, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + dstTy *dst_tp = reinterpret_cast(dst_p); + + using dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexer strided_indexer(nd, 0, shape_strides); + + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = full_strided_kernel; + using Impl = FullStridedFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(dst_tp, fill_v, strided_indexer)); + }); + + return fill_ev; +} + +/* ================ Eye ================== */ + +typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &, + std::size_t nelems, // num_elements + ssize_t start, + ssize_t end, + ssize_t step, + char *, // dst_data_ptr + const std::vector &); + +template +class EyeFunctor +{ +private: + Ty *p = nullptr; + ssize_t start_v; + ssize_t end_v; + ssize_t step_v; + +public: + EyeFunctor(char *dst_p, + const ssize_t v0, + const ssize_t v1, + const ssize_t dv) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + Ty set_v = 0; + ssize_t i = static_cast(wiid.get(0)); + if (i >= start_v and i <= end_v) { + if ((i - start_v) % step_v == 0) { + set_v = 1; + } + } + p[i] = set_v; + } +}; + +/*! + * @brief Function to populate 2D array with eye matrix. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Number of elements to assign. + * @param start Position of the first non-zero value. + * @param end Position of the last non-zero value. + * @param step Number of array elements between non-zeros. + * @param array_data Kernel accessible USM pointer for the destination array. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event eye_impl(sycl::queue &exec_q, + std::size_t nelems, + const ssize_t start, + const ssize_t end, + const ssize_t step, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = eye_kernel; + using Impl = EyeFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start, end, step)); + }); + + return eye_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct EyeFactory +{ + fnT get() + { + fnT f = eye_impl; + return f; + } +}; + +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. 
+ * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +class tri_kernel; +template +sycl::event tri_impl(sycl::queue &exec_q, + ssize_t inner_range, + ssize_t outer_range, + char *src_p, + char *dst_p, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + static constexpr int d2 = 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + ssize_t src_inner_offset = 0, dst_inner_offset = 0; + bool to_copy{false}; + + { + using dpctl::tensor::strides::CIndexer_array; + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); + src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + ssize_t src_offset = 0; + ssize_t dst_offset = 0; + { + using dpctl::tensor::strides::CIndexer_vector; + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
+ * @ingroup CtorKernels
+ */
+template <typename fnT, typename Ty>
+struct TriuGenericFactory
+{
+    fnT get()
+    {
+        fnT f = tri_impl<Ty, /* upper = */ false>;
+        return f;
+    }
+};
+
+} // namespace dpctl::tensor::kernels::constructors
diff --git a/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
new file mode 100644
index 000000000000..2c4146d467e6
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -0,0 +1,1273 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for tensor copying and value casting.
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::copy_and_cast +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class copy_cast_generic_kernel; + +template +class copy_cast_contig_kernel; + +template +class copy_cast_from_host_kernel; + +template +class copy_cast_from_host_contig_kernel; + +template +class Caster +{ +public: + Caster() = default; + dstTy operator()(const srcTy &src) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(src); + } +}; + +template +class GenericCopyFunctor +{ +private: + const srcT *src_ = nullptr; + dstT *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer) + : src_(src_p), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + static constexpr CastFnT fn{}; + dst_[dst_offset] = fn(src_[src_offset]); + } +}; + +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. + */ +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specified via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. 
+ @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_generic_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset, + shape_and_strides}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor, + TwoOffsets_StridedIndexer>(src_tp, dst_tp, + indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for contiguous arrays + +template +class ContigCopyFunctor +{ +private: + std::size_t nelems; + const srcT *src_p = nullptr; + dstT *dst_p = nullptr; + +public: + ContigCopyFunctor(const std::size_t nelems_, + const srcT *src_p_, + dstT *dst_p_) + : nelems(nelems_), src_p(src_p_), dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr CastFnT fn{}; + + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (!enable_sg_loadstore || is_complex_v || + is_complex_v) { + std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = fn(src_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&src_p[offset]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[offset]); + + const sycl::vec src_vec = + sub_group_load(sg, src_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; k++) { + dst_vec[k] = fn(src_vec[k]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + 
const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k += sgSize) { + dst_p[k] = fn(src_p[k]); + } + } + } + } +}; + +/*! + * @brief Function pointer type for contiguous array cast and copy function. + */ +typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +/*! + * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray + to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have the same number of elements `nelems`. + `src_cp` and `dst_cp` represent char pointers to the start of respective + arrays. Kernel is submitted to sycl queue `q` with events `depends` as + dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param src_p Kernel accessible USM pointer for the source array + @param dst_p Kernel accessible USM pointer for the destination array + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *src_cp, + char *dst_cp, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const srcTy *src_tp = reinterpret_cast(src_cp); + dstTy *dst_tp = reinterpret_cast(dst_cp); + + std::size_t lws = 64; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(src_cp) && + is_aligned(dst_cp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get specialized function pointer for casting and copying + * contiguous arrays. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_contig_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +/*! + * @brief Factory to get function pointer for casting and copying 1D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Factory to get function pointer for casting and copying 2D arrays. 
+ * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_nd_specialized_impl( + sycl::queue &q, + std::size_t nelems, + const std::array &shape, + const std::array &src_strides, + const std::array &dst_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + using IndexerT = TwoOffsets_FixedDimStridedIndexer; + const IndexerT indexer{shape, src_strides, dst_strides, src_offset, + dst_offset}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.depends_on(depends); + cgh.parallel_for< + class copy_cast_generic_kernel>( + sycl::range<1>(nelems), + GenericCopyFunctor, IndexerT>( + src_tp, dst_tp, indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCast2DFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_nd_specialized_impl<D, S, 2>;
+        return f;
+    }
+};
+
+// ====================== Copying from host to USM
+
+template <typename AccessorT,
+          typename dstTy,
+          typename CastFnT,
+          typename IndexerT>
+class GenericCopyFromHostFunctor
+{
+private:
+    AccessorT src_acc_;
+    dstTy *dst_ = nullptr;
+    IndexerT indexer_;
+
+public:
+    GenericCopyFromHostFunctor(const AccessorT &src_acc,
+                               dstTy *dst_p,
+                               const IndexerT &indexer)
+        : src_acc_(src_acc), dst_(dst_p), indexer_(indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const auto &offsets = indexer_(static_cast<ssize_t>(wiid.get(0)));
+        const ssize_t &src_offset = offsets.get_first_offset();
+        const ssize_t &dst_offset = offsets.get_second_offset();
+
+        CastFnT fn{};
+        dst_[dst_offset] = fn(src_acc_[src_offset]);
+    }
+};
+
+typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    ssize_t,
+    ssize_t,
+    ssize_t,
+    char *,
+    ssize_t,
+    const std::vector<sycl::event> &,
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
+ * into usm_ndarray with elements of type `dstTy`.
+ *
+ * Function to cast and copy elements from numpy.ndarray specified by typeless
+ * `host_src_p` and the `src_offset` given in the number of array elements.
+ * Arrays' metadata are given in packed USM vector of length `3*nd` whose first
+ * `nd` elements contain arrays' shape, next `nd` elements specify source
+ * strides in elements (not bytes), and trailing `nd` elements specify
+ * destination array strides. Kernel dependencies are given by two vectors of
+ * events: `depends` and `additional_depends`. The function execution is
+ * complete at the return.
+ *
+ * @param q The queue where the routine should be executed.
+ * @param nelems Number of elements to cast and copy.
+ * @param nd The dimensionality of arrays
+ * @param shape_and_strides Kernel accessible USM pointer to packed shape and
+ * strides.
+ * @param host_src_p Host (not USM allocated) pointer associated with the
+ * source array.
+ * @param src_offset Offset to the beginning of iteration in number of elements
+ * of the source array from `host_src_p`.
+ * @param src_min_nelem_offset Smallest value of offset relative to
+ * `host_src_p` in number of elements attained while iterating over elements of
+ * the source array.
+ * @param src_max_nelem_offset Largest value of offset relative to `host_src_p`
+ * in number of elements attained while iterating over elements of the source
+ * array.
+ * @param dst_p USM pointer associated with the destination array.
+ * @param dst_offset Offset to the beginning of iteration in number of elements
+ * of the destination array from `dst_p`.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ * @param additional_depends List of additional events to wait for before
+ * starting computations, if any.
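+ *
+ * For example (editor's illustration), for `nd == 2` with shape `(m, n)` the
+ * packed metadata reads `[m, n, src_s0, src_s1, dst_s0, dst_s1]`, with all
+ * strides expressed in elements.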
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *host_src_p,
+    ssize_t src_offset,
+    ssize_t src_min_nelem_offset,
+    ssize_t src_max_nelem_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1;
+
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_min_nelem_offset,
+        sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        const TwoOffsets_StridedIndexer indexer{
+            nd, src_offset - src_min_nelem_offset, dst_offset,
+            const_cast<const ssize_t *>(shape_and_strides)};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
+
+        cgh.parallel_for<copy_cast_from_host_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>,
+                                       TwoOffsets_StridedIndexer>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_impl<D, S>;
+        return f;
+    }
+};
+
+typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,  /* nelems */
+    const char *, /* src_pointer */
+    ssize_t,      /* src_offset */
+    char *,       /* dst_pointer */
+    ssize_t,      /* dst_offset */
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
+ * into usm_ndarray with elements of type `dstTy` for contiguous arrays.
+ *
+ * Function to cast and copy elements from numpy.ndarray specified by typeless
+ * `host_src_p` and the `src_offset` given in the number of array elements.
+ * Kernel dependencies are given by the vector of events `depends`. The
+ * function execution is complete at the return.
+ *
+ * @param q The queue where the routine should be executed.
+ * @param nelems Number of elements to cast and copy.
+ * @param host_src_p Host (not USM allocated) pointer associated with the
+ * source array.
+ * @param src_offset Offset to the beginning of iteration in number of elements
+ * of the source array from `host_src_p`.
+ * @param dst_p USM pointer associated with the destination array.
+ * @param dst_offset Offset to the beginning of iteration in number of elements
+ * of the destination array from `dst_p`.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
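+ *
+ * The copy is blocking by design: the `sycl::buffer` wrapping the host
+ * pointer must outlive the kernel, so the implementation waits on the kernel
+ * event before returning.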
+ *
+ * @ingroup CopyAndCastKernels
+ */
+template <typename dstTy, typename srcTy>
+void copy_and_cast_from_host_contig_impl(
+    sycl::queue &q,
+    std::size_t nelems,
+    const char *host_src_p,
+    ssize_t src_offset,
+    char *dst_p,
+    ssize_t dst_offset,
+    const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
+    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
+
+    sycl::buffer<srcTy, 1> npy_buf(
+        reinterpret_cast<const srcTy *>(host_src_p) + src_offset,
+        sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}});
+
+    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
+
+        using IndexerT = TwoOffsets_CombinedIndexer<NoOpIndexer, NoOpIndexer>;
+        static constexpr NoOpIndexer src_indexer{};
+        static constexpr NoOpIndexer dst_indexer{};
+        static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer,
+                                                            dst_indexer};
+
+        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<
+            copy_cast_from_host_contig_kernel<srcTy, dstTy>>(
+            sycl::range<1>(nelems),
+            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
+                                       Caster<srcTy, dstTy>, IndexerT>(
+                npy_acc, dst_tp, indexer));
+    });
+
+    // perform explicit synchronization. Implicit synchronization would be
+    // performed by sycl::buffer destructor.
+    copy_and_cast_from_host_ev.wait();
+
+    return;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given NumPy array
+ * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename D, typename S>
+struct CopyAndCastFromHostContigFactory
+{
+    fnT get()
+    {
+        fnT f = copy_and_cast_from_host_contig_impl<D, S>;
+        return f;
+    }
+};
+
+// =============== Copying for reshape ================== //
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_reshape_generic_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class GenericCopyForReshapeFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    GenericCopyForReshapeFunctor(const char *src_ptr,
+                                 char *dst_ptr,
+                                 const SrcIndexerT &src_indexer,
+                                 const DstIndexerT &dst_indexer)
+        : src_p(reinterpret_cast<const Ty *>(src_ptr)),
+          dst_p(reinterpret_cast<Ty *>(dst_ptr)), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const ssize_t src_offset = src_indexer_(wiid.get(0));
+        const ssize_t dst_offset = dst_indexer_(wiid.get(0));
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_reshape_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // num_elements
+    int,             // src_nd
+    int,             // dst_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    char *,          // dst_data_ptr
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array while reshaping.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index(i,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q      The execution queue where kernel is submitted.
+ * @param nelems The number of elements to copy
+ * @param src_nd Array dimension of the source array
+ * @param dst_nd Array dimension of the destination array
+ * @param packed_shapes_and_strides Kernel accessible USM array of size
+ * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape,
+ * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param depends List of events to wait for before starting computations, if
+ * any.
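+ *
+ * E.g. (editor's illustration) reshaping a C-contiguous `(6,)` array into
+ * `(2, 3)` packs `[6, 1, 2, 3, 3, 1]`, i.e. `[src_shape, src_strides,
+ * dst_shape, dst_strides]`.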
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event
+copy_for_reshape_generic_impl(sycl::queue &q,
+                              std::size_t nelems,
+                              int src_nd,
+                              int dst_nd,
+                              const ssize_t *packed_shapes_and_strides,
+                              const char *src_p,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        // packed_shapes_and_strides:
+        //   USM array of size 2*(src_nd + dst_nd)
+        //   [ src_shape; src_strides; dst_shape; dst_strides ]
+
+        const ssize_t *src_shape_and_strides =
+            const_cast<const ssize_t *>(packed_shapes_and_strides);
+
+        const ssize_t *dst_shape_and_strides = const_cast<const ssize_t *>(
+            packed_shapes_and_strides + (2 * src_nd));
+
+        const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides};
+        const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides};
+
+        using KernelName =
+            copy_for_reshape_generic_kernel<Ty, StridedIndexer,
+                                            StridedIndexer>;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            GenericCopyForReshapeFunctor<Ty, StridedIndexer, StridedIndexer>(
+                src_p, dst_p, src_indexer, dst_indexer));
+    });
+
+    return copy_for_reshape_ev;
+}
+
+/*!
+ * @brief Factory to get function pointer of type `fnT` for given array data
+ * type `Ty`.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename fnT, typename Ty>
+struct CopyForReshapeGenericFactory
+{
+    fnT get()
+    {
+        fnT f = copy_for_reshape_generic_impl<Ty>;
+        return f;
+    }
+};
+
+// ================== Copying for roll ================== //
+
+/*! @brief Functor to cyclically roll global_id to the left */
+struct LeftRolled1DTransformer
+{
+    LeftRolled1DTransformer(std::size_t offset, std::size_t size)
+        : offset_(offset), size_(size)
+    {
+    }
+
+    std::size_t operator()(std::size_t gid) const
+    {
+        const std::size_t shifted_gid =
+            ((gid < offset_) ? gid + size_ - offset_ : gid - offset_);
+        return shifted_gid;
+    }
+
+private:
+    std::size_t offset_ = 0;
+    std::size_t size_ = 1;
+};
+
+/*! @brief Indexer functor to compose indexer and transformer */
+template <typename IndexerT, typename TransformerT>
+struct CompositionIndexer
+{
+    CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {}
+
+    auto operator()(std::size_t gid) const { return f_(t_(gid)); }
+
+private:
+    IndexerT f_;
+    TransformerT t_;
+};
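+
+// Editor's note: composing NoOpIndexer with LeftRolled1DTransformer makes
+// work-item `gid` read from position `(gid - offset) mod size`, e.g.:
+//
+//     const LeftRolled1DTransformer roll{/*offset*/ 2, /*size*/ 5};
+//     const CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>
+//         rolled{NoOpIndexer{}, roll};
+//     // rolled(0) == 3, rolled(1) == 4, rolled(2) == 0, ...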
+
+/*! @brief Indexer functor to find offset for nd-shifted indices lifted from
+ * iteration id */
+struct RolledNDIndexer
+{
+    RolledNDIndexer(int nd,
+                    const ssize_t *shape,
+                    const ssize_t *strides,
+                    const ssize_t *ndshifts,
+                    ssize_t starting_offset)
+        : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts),
+          starting_offset_(starting_offset)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const { return compute_offset(gid); }
+
+private:
+    int nd_ = -1;
+    const ssize_t *shape_ = nullptr;
+    const ssize_t *strides_ = nullptr;
+    const ssize_t *ndshifts_ = nullptr;
+    ssize_t starting_offset_ = 0;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd_);
+        ssize_t relative_offset_(0);
+        _ind.get_left_rolled_displacement(
+            gid,
+            shape_,    // shape ptr
+            strides_,  // strides ptr
+            ndshifts_, // shifts ptr
+            relative_offset_);
+        return starting_offset_ + relative_offset_;
+    }
+};
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class copy_for_roll_strided_kernel;
+
+template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
+class StridedCopyForRollFunctor
+{
+private:
+    const Ty *src_p = nullptr;
+    Ty *dst_p = nullptr;
+    SrcIndexerT src_indexer_;
+    DstIndexerT dst_indexer_;
+
+public:
+    StridedCopyForRollFunctor(const Ty *src_ptr,
+                              Ty *dst_ptr,
+                              const SrcIndexerT &src_indexer,
+                              const DstIndexerT &dst_indexer)
+        : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer),
+          dst_indexer_(dst_indexer)
+    {
+    }
+
+    void operator()(sycl::id<1> wiid) const
+    {
+        const std::size_t gid = wiid.get(0);
+
+        const ssize_t src_offset = src_indexer_(gid);
+        const ssize_t dst_offset = dst_indexer_(gid);
+
+        dst_p[dst_offset] = src_p[src_offset];
+    }
+};
+
+// define function type
+typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,     // shift
+    std::size_t,     // num_elements
+    int,             // common_nd
+    const ssize_t *, // packed shapes and strides
+    const char *,    // src_data_ptr
+    ssize_t,         // src_offset
+    char *,          // dst_data_ptr
+    ssize_t,         // dst_offset
+    const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to copy content of array with a shift.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q      The execution queue where kernel is submitted.
+ * @param shift  The shift in flat indexing, must be non-negative.
+ * @param nelems The number of elements to copy
+ * @param nd     Array dimensionality of the destination and source arrays
+ * @param packed_shapes_and_strides Kernel accessible USM array
+ * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param src_offset Displacement of first element of src relative src_p in
+ * elements
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param dst_offset Displacement of first element of dst relative dst_p in
+ * elements
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
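+ *
+ * For non-negative `shift` this realizes `dst[(i + shift) % nelems] = src[i]`
+ * in flat index space, matching `numpy.roll(x, shift)` semantics.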
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event copy_for_roll_strided_impl(sycl::queue &q,
+                                       std::size_t shift,
+                                       std::size_t nelems,
+                                       int nd,
+                                       const ssize_t *packed_shapes_and_strides,
+                                       const char *src_p,
+                                       ssize_t src_offset,
+                                       char *dst_p,
+                                       ssize_t dst_offset,
+                                       const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        // packed_shapes_and_strides:
+        //   USM array of size 3 * nd
+        //   [ common_shape; src_strides; dst_strides ]
+
+        const StridedIndexer src_indexer{nd, src_offset,
+                                         packed_shapes_and_strides};
+        const LeftRolled1DTransformer left_roll_transformer{shift, nelems};
+
+        using CompositeIndexerT =
+            CompositionIndexer<StridedIndexer, LeftRolled1DTransformer>;
+
+        const CompositeIndexerT rolled_src_indexer(src_indexer,
+                                                   left_roll_transformer);
+
+        UnpackedStridedIndexer dst_indexer{nd, dst_offset,
+                                           packed_shapes_and_strides,
+                                           packed_shapes_and_strides + 2 * nd};
+
+        using KernelName =
+            copy_for_roll_strided_kernel<Ty, CompositeIndexerT,
+                                         UnpackedStridedIndexer>;
+
+        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p);
+        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p);
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            StridedCopyForRollFunctor<Ty, CompositeIndexerT,
+                                      UnpackedStridedIndexer>(
+                src_tp, dst_tp, rolled_src_indexer, dst_indexer));
+    });
+
+    return copy_for_roll_ev;
+}
+
+// define function type
+typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,  // shift
+    std::size_t,  // num_elements
+    const char *, // src_data_ptr
+    ssize_t,      // src_offset
+    char *,       // dst_data_ptr
+    ssize_t,      // dst_offset
+    const std::vector<sycl::event> &);
+
+template <typename Ty>
+class copy_for_roll_contig_kernel;
+
+/*!
+ * @brief Function to copy content of array with a shift.
+ *
+ * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems,
+ * dst.shape)] = src[unravel_index(i, src.shape)]`.
+ *
+ * @param q      The execution queue where kernel is submitted.
+ * @param shift  The shift in flat indexing, must be non-negative.
+ * @param nelems The number of elements to copy
+ * @param src_p Typeless USM pointer to the buffer of the source array
+ * @param src_offset Displacement of the start of array src relative src_p in
+ * elements
+ * @param dst_p Typeless USM pointer to the buffer of the destination array
+ * @param dst_offset Displacement of the start of array dst relative dst_p in
+ * elements
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels
+ */
+template <typename Ty>
+sycl::event copy_for_roll_contig_impl(sycl::queue &q,
+                                      std::size_t shift,
+                                      std::size_t nelems,
+                                      const char *src_p,
+                                      ssize_t src_offset,
+                                      char *dst_p,
+                                      ssize_t dst_offset,
+                                      const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
+
+    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        static constexpr NoOpIndexer src_indexer{};
+        const LeftRolled1DTransformer roller{shift, nelems};
+
+        const CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>
+            left_rolled_src_indexer{src_indexer, roller};
+        static constexpr NoOpIndexer dst_indexer{};
+
+        using KernelName = copy_for_roll_contig_kernel<Ty>;
+
+        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p) + src_offset;
+        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p) + dst_offset;
+
+        cgh.parallel_for<KernelName>(
+            sycl::range<1>(nelems),
+            StridedCopyForRollFunctor<
+                Ty, CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>,
+                NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer,
+                             dst_indexer));
+    });
+
+    return copy_for_roll_ev;
+}
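+
+// Editor's sketch of a hypothetical call site (names `q`, `src_cp`, `dst_cp`
+// and the element type are assumptions, for illustration only):
+//
+//     // dst[(i + 2) % n] = src[i] for a contiguous float array of n elements
+//     sycl::event e = copy_for_roll_contig_impl<float>(
+//         q, 2, n, src_cp, 0, dst_cp, 0, {});
+//     e.wait();
+
+/*!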
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, + shifts_ptr, src_offset}; + + const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + +} // namespace dpctl::tensor::kernels::copy_and_cast diff --git a/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 000000000000..a723f6334e7e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,636 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::copy_as_contig +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class CopyAsCContigFunctor +{ +private: + std::size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(std::size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_max_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) + // gid % sgSize == gid - (gid / sgSize) * sgSize + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + const std::uint16_t elems_per_sg = elems_per_wi * sgSize; + + if (base + elems_per_sg < nelems) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + // it == vec_id * vec_sz, for 0 <= vec_id < n_vecs + const std::size_t block_start_id = base + it * sgSize; + auto dst_multi_ptr = sycl::address_space_cast< + 
sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[block_start_id]); + + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + const std::size_t elem_id = elem_id0 + k * sgSize; + const ssize_t src_offset = src_indexer(elem_id); + dst_vec[k] = src_p[src_offset]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { + const ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, + std::size_t nelems, + const T *src, + T *dst, + const IndexerT &src_indexer, + const std::vector &depends) +{ + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::size_t preferred_lws = 256; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = + (nelems + nelems_per_group - 1) / (nelems_per_group); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src, dst, src_indexer)); + }); + return copy_ev; +} + +template +class as_contig_krn; + +template +sycl::event + as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + + using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + + sycl::event copy_ev; + if (is_aligned(dst_p)) { + static constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + else { + static constexpr bool disable_sg_load = false; + using InnerKernelName = + as_contig_krn; + using KernelName = disabled_sg_loadstore_wrapper_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + char *, + const 
std::vector &); + +template +struct AsCContigFactory +{ + fnT get() { return as_c_contiguous_array_generic_impl; } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = + n_lines * private_tile_size * private_tile_size; + + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * private_tile_size); + + static constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. 
Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. 
Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == + // (block_size * block_size / nelems_per_wi) == + // (block_size/private_tile_size)**2 + static constexpr std::uint16_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint16_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint16_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint16_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint16_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint16_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint16_t dst_i0 = src_i1; + const std::uint16_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint16_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint16_t _pr_step_local = lws1 * block_size; + + for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint16_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) { + + // 0 <= local_i0 < local_dim0 + const std::uint16_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint16_t dst_i0 = loc_i0; + const std::uint16_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint16_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_1d_batch_of_square_matrices_impl; } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + int 
batch_nd, + const ssize_t *src_batch_shape_strides, + const ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = TwoOffsets_CombinedIndexer; + + static constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_nd_batch_of_square_matrices_impl; } +}; +} // namespace dpctl::tensor::kernels::copy_as_contig diff --git a/dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp new file mode 100644 index 000000000000..4db78e1805e3 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp @@ -0,0 +1,40 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace dpctl::tensor +{ +typedef std::ptrdiff_t ssize_t; +} // namespace dpctl::tensor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp new file mode 100644 index 000000000000..250ba1d70455 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -0,0 +1,237 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ABS(x) function. 
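+/// For complex inputs the magnitude is delegated to `detail::cabs` from
+/// cabs_impl.hpp (see AbsFunctor below).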
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "cabs_impl.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::abs +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::ssize_t; +using dpctl::tensor::type_utils::is_complex; + +template +struct AbsFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) const + { + + if constexpr (std::is_same_v || + (std::is_integral::value && + std::is_unsigned::value)) { + static_assert(std::is_same_v); + return x; + } + else { + if constexpr (is_complex::value) { + return detail::cabs(x); + } + else if constexpr (std::is_same_v || + std::is_floating_point_v) { + return (sycl::signbit(x) ? -x : x); + } + else { + return sycl::abs(x); + } + } + } +}; + +template +using AbsContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct AbsOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct AbsContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class abs_contig_kernel; + +template +sycl::event abs_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AbsHS = hyperparam_detail::AbsContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AbsHS::vec_sz; + static constexpr std::uint8_t n_vec = AbsHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AbsContigFactory +{ + fnT get() + { + if constexpr (!AbsOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = abs_contig_impl; + return fn; + } + } +}; + +template +struct AbsTypeMapFactory +{ + /*! 
@brief get typeid for output type of abs(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AbsOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using AbsStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class abs_strided_kernel; + +template +sycl::event abs_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AbsStridedFactory +{ + fnT get() + { + if constexpr (!AbsOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = abs_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::abs diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp new file mode 100644 index 000000000000..9ceeb0947439 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -0,0 +1,273 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ACOS(x) function. 
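+/// Complex special values (NaN and infinity combinations) are handled
+/// explicitly in AcosFunctor below, broadly following the C99 Annex G
+/// conventions.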
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::acos +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AcosFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isnan(x)) { + /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */ + if (std::isinf(y)) { + return resT{q_nan, -y}; + } + + /* all other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + if (std::isnan(y)) { + /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ + if (std::isinf(x)) { + return resT{q_nan, -std::numeric_limits::infinity()}; + } + /* acos(0 + I*NaN) = PI/2 + I*NaN with inexact */ + if (x == realT(0)) { + const realT res_re = sycl::atan(realT(1)) * 2; // PI/2 + return resT{res_re, q_nan}; + } + + /* all other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including acos(+-Inf + I*+-Inf) + */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = exprm_ns::complex; + sycl_complexT log_in = + exprm_ns::log(exprm_ns::complex(in)); + + const realT wx = log_in.real(); + const realT wy = log_in.imag(); + const realT rx = sycl::fabs(wy); + + realT ry = wx + sycl::log(realT(2)); + return resT{rx, (sycl::signbit(y)) ? 
ry : -ry}; + } + + /* ordinary cases */ + return exprm_ns::acos(exprm_ns::complex(in)); // acos(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::acos(in); + } + } +}; + +template +using AcosContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AcosStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AcosOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct AcosContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class acos_contig_kernel; + +template +sycl::event acos_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AcosHS = hyperparam_detail::AcosContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AcosHS::vec_sz; + static constexpr std::uint8_t n_vec = AcosHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AcosContigFactory +{ + fnT get() + { + if constexpr (!AcosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acos_contig_impl; + return fn; + } + } +}; + +template +struct AcosTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::acos(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AcosOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class acos_strided_kernel; + +template +sycl::event + acos_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AcosOutputType, AcosStridedFunctor, acos_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AcosStridedFactory +{ + fnT get() + { + if constexpr (!AcosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acos_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::acos diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp new file mode 100644 index 000000000000..e356b37361d8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -0,0 +1,304 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ACOSH(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::acosh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AcoshFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + /* + * acosh(in) = I*acos(in) or -I*acos(in) + * where the sign is chosen so Re(acosh(in)) >= 0. + * So, we first calculate acos(in) and then acosh(in). 
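+             * For example, with in = 2 + 0*I: acos(in) = -1.3170*I, so
+             * I*acos(in) = +1.3170 and -I*acos(in) = -1.3170; the branch
+             * with Re(acosh(in)) >= 0 reproduces the real
+             * acosh(2) = log(2 + sqrt(3)) ~= 1.3170.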
+ */ + const realT x = std::real(in); + const realT y = std::imag(in); + + resT acos_in; + if (std::isnan(x)) { + /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */ + if (std::isinf(y)) { + acos_in = resT{q_nan, -y}; + } + else { + acos_in = resT{q_nan, q_nan}; + } + } + else if (std::isnan(y)) { + /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */ + static constexpr realT inf = + std::numeric_limits::infinity(); + + if (std::isinf(x)) { + acos_in = resT{q_nan, -inf}; + } + /* acos(0 + I*NaN) = Pi/2 + I*NaN with inexact */ + else if (x == realT(0)) { + const realT pi_half = sycl::atan(realT(1)) * 2; + acos_in = resT{pi_half, q_nan}; + } + else { + acos_in = resT{q_nan, q_nan}; + } + } + + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + /* + * For large x or y including acos(+-Inf + I*+-Inf) + */ + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = typename exprm_ns::complex; + const sycl_complexT log_in = exprm_ns::log(sycl_complexT(in)); + const realT wx = log_in.real(); + const realT wy = log_in.imag(); + const realT rx = sycl::fabs(wy); + realT ry = wx + sycl::log(realT(2)); + acos_in = resT{rx, (sycl::signbit(y)) ? ry : -ry}; + } + else { + /* ordinary cases */ + acos_in = + exprm_ns::acos(exprm_ns::complex(in)); // acos(in); + } + + /* Now we calculate acosh(z) */ + const realT rx = std::real(acos_in); + const realT ry = std::imag(acos_in); + + /* acosh(NaN + I*NaN) = NaN + I*NaN */ + if (std::isnan(rx) && std::isnan(ry)) { + return resT{ry, rx}; + } + /* acosh(NaN + I*+-Inf) = +Inf + I*NaN */ + /* acosh(+-Inf + I*NaN) = +Inf + I*NaN */ + if (std::isnan(rx)) { + return resT{sycl::fabs(ry), rx}; + } + /* acosh(0 + I*NaN) = NaN + I*NaN */ + if (std::isnan(ry)) { + return resT{ry, ry}; + } + /* ordinary cases */ + const realT res_im = sycl::copysign(rx, std::imag(in)); + return resT{sycl::fabs(ry), res_im}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::acosh(in); + } + } +}; + +template +using AcoshContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AcoshStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AcoshOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AcoshContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class acosh_contig_kernel; + +template +sycl::event acosh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AcoshHS = hyperparam_detail::AcoshContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AcoshHS::vec_sz; + static constexpr std::uint8_t n_vec = AcoshHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, 
depends); +} + +template +struct AcoshContigFactory +{ + fnT get() + { + if constexpr (!AcoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acosh_contig_impl; + return fn; + } + } +}; + +template +struct AcoshTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::acosh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AcoshOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class acosh_strided_kernel; + +template +sycl::event + acosh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AcoshOutputType, AcoshStridedFunctor, acosh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AcoshStridedFactory +{ + fnT get() + { + if constexpr (!AcoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = acosh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::acosh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp new file mode 100644 index 000000000000..c7386f99236a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -0,0 +1,679 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ADD(x1, x2) +/// function. 
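+///
+/// Illustrative host-side sketch (not part of the kernel API): elementwise,
+/// the contiguous kernel below behaves like
+///
+///     std::vector<float> a{1.f, 2.f}, b{3.f, 4.f}, r(2);
+///     std::transform(a.begin(), a.end(), b.begin(), r.begin(),
+///                    [](float x, float y) { return x + y; });
+///     // r == {4.f, 6.f}
+///
+/// with the result type fixed by AddOutputType (e.g. float + double
+/// promotes to double) and the device loop vectorized over sycl::vec
+/// where sub-group load/store is supported.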
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::add +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct AddFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using rT1 = typename argT1::value_type; + using rT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) + exprm_ns::complex(in2); + } + else if constexpr (tu_ns::is_complex::value && + !tu_ns::is_complex::value) { + using rT1 = typename argT1::value_type; + + return exprm_ns::complex(in1) + in2; + } + else if constexpr (!tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using rT2 = typename argT2::value_type; + + return in1 + exprm_ns::complex(in2); + } + else { + return in1 + in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 + in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using AddContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AddStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct AddOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct AddContigHyperparameterSet +{ + using value_type = typename std::disjunction< + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + BinaryContigHyperparameterSetEntry, + ContigHyperparameterSetDefault<4u, 2u>>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class add_contig_kernel; + +template +sycl::event add_contig_impl(sycl::queue &exec_q, + 
std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using AddHS = hyperparam_detail::AddContigHyperparameterSet; + static constexpr auto vec_sz = AddHS::vec_sz; + static constexpr auto n_vecs = AddHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct AddContigFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_contig_impl; + return fn; + } + } +}; + +template +struct AddTypeMapFactory +{ + /*! @brief get typeid for output type of std::add(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename AddOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class add_strided_kernel; + +template +sycl::event add_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, AddOutputType, AddStridedFunctor, add_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct AddStridedFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_strided_impl; + return fn; + } + } +}; + +template +class add_matrix_row_broadcast_sg_krn; + +template +using AddContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + AddFunctor>; + +template +sycl::event add_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] + vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor, + add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +} + +template +struct AddContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename AddOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + add_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event add_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] + vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return add_contig_matrix_contig_row_broadcast_impl( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +}; + +template +struct AddContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!AddOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename AddOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + add_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct AddInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) { res += in; } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res += in; + } +}; + +template +using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + AddInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AddInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + AddInplaceFunctor>; + +template +class add_inplace_contig_kernel; + +/* @brief Types supported by in-place add */ +template +struct AddInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + 
td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct AddInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x += y */ + std::enable_if_t::value, int> get() + { + if constexpr (AddInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + add_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + static constexpr auto vec_sz = + hyperparam_detail::AddContigHyperparameterSet::vec_sz; + static constexpr auto n_vecs = + hyperparam_detail::AddContigHyperparameterSet::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset, + depends); +} + +template +struct AddInplaceContigFactory +{ + fnT get() + { + if constexpr (!AddInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_inplace_contig_impl; + return fn; + } + } +}; + +template +class add_inplace_strided_kernel; + +template +sycl::event + add_inplace_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, AddInplaceStridedFunctor, add_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AddInplaceStridedFactory +{ + fnT get() + { + if constexpr (!AddInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_inplace_strided_impl; + return fn; + } + } +}; + +template +class add_inplace_row_matrix_broadcast_sg_krn; + +template +using AddInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + AddInplaceFunctor>; + +template +sycl::event add_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, AddInplaceRowMatrixBroadcastingFunctor, + add_inplace_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, + vec_p, vec_offset, mat_p, + mat_offset, depends); +} + +template +struct AddInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!AddInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = add_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace dpctl::tensor::kernels::add diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp new file mode 100644 index 000000000000..93dbd648e575 --- /dev/null +++ 
b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -0,0 +1,215 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ANGLE(x) function. 
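+///
+/// The functor reduces to the complex argument, which matches std::arg on
+/// the host; a quick illustration of the values it computes:
+///
+///     std::arg(std::complex<double>(1.0, 0.0));   // 0
+///     std::arg(std::complex<double>(-1.0, 0.0));  // pi
+///     std::arg(std::complex<double>(0.0, 2.0));   // pi/2
+///
+/// i.e. for z = x + I*y the result is atan2(y, x), in (-pi, pi].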
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::angle +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AngleFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + using rT = typename argT::value_type; + + return exprm_ns::arg(exprm_ns::complex(in)); // arg(in); + } +}; + +template +using AngleContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AngleStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AngleOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AngleContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class angle_contig_kernel; + +template +sycl::event angle_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AngleHS = hyperparam_detail::AngleContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AngleHS::vec_sz; + static constexpr std::uint8_t n_vec = AngleHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AngleContigFactory +{ + fnT get() + { + if constexpr (!AngleOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = angle_contig_impl; + return fn; + } + } +}; + +template +struct AngleTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::arg(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AngleOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class angle_strided_kernel; + +template +sycl::event + angle_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AngleOutputType, AngleStridedFunctor, angle_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AngleStridedFactory +{ + fnT get() + { + if constexpr (!AngleOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = angle_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::angle diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp new file mode 100644 index 000000000000..d367c1243628 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -0,0 +1,296 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ASIN(x) function. 
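+///
+/// The complex branch relies on the reflection identity
+/// asin(z) = I * conj(asinh(I * conj(z))), which can be sanity-checked on
+/// the host with std::complex (illustrative sketch):
+///
+///     using C = std::complex<double>;
+///     const C z{0.5, 0.25}, I{0.0, 1.0};
+///     const C lhs = std::asin(z);
+///     const C rhs = I * std::conj(std::asinh(I * std::conj(z)));
+///     // lhs and rhs agree to rounding error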
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::asin +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AsinFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + /* + * asin(in) = I * conj( asinh(I * conj(in)) ) + * so we first calculate w = asinh(I * conj(in)) with + * x = real(I * conj(in)) = imag(in) + * y = imag(I * conj(in)) = real(in) + * and then return {imag(w), real(w)} which is asin(in) + */ + const realT x = std::imag(in); + const realT y = std::real(in); + + if (std::isnan(x)) { + /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */ + if (std::isinf(y)) { + const realT asinh_re = y; + const realT asinh_im = q_nan; + return resT{asinh_im, asinh_re}; + } + /* asinh(NaN + I*0) = NaN + I*0 */ + if (y == realT(0)) { + const realT asinh_re = q_nan; + const realT asinh_im = y; + return resT{asinh_im, asinh_re}; + } + /* All other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + else if (std::isnan(y)) { + /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */ + if (std::isinf(x)) { + const realT asinh_re = x; + const realT asinh_im = q_nan; + return resT{asinh_im, asinh_re}; + } + /* All other cases involving NaN return NaN + I*NaN. 
*/ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including asinh(+-Inf + I*+-Inf) + * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2) as in -> + * infinity The above formula works for the imaginary part as well, + * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + + * O(y/in^3) as in -> infinity, uniformly in y + */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = exprm_ns::complex; + const sycl_complexT z{x, y}; + realT wx, wy; + if (!sycl::signbit(x)) { + const auto log_z = exprm_ns::log(z); + wx = log_z.real() + sycl::log(realT(2)); + wy = log_z.imag(); + } + else { + const auto log_mz = exprm_ns::log(-z); + wx = log_mz.real() + sycl::log(realT(2)); + wy = log_mz.imag(); + } + const realT asinh_re = sycl::copysign(wx, x); + const realT asinh_im = sycl::copysign(wy, y); + return resT{asinh_im, asinh_re}; + } + /* ordinary cases */ + return exprm_ns::asin( + exprm_ns::complex(in)); // sycl::asin(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::asin(in); + } + } +}; + +template +using AsinContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AsinStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AsinOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AsinContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class asin_contig_kernel; + +template +sycl::event asin_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AddHS = hyperparam_detail::AsinContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AddHS::vec_sz; + static constexpr std::uint8_t n_vec = AddHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AsinContigFactory +{ + fnT get() + { + if constexpr (!AsinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asin_contig_impl; + return fn; + } + } +}; + +template +struct AsinTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::asin(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AsinOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class asin_strided_kernel; + +template +sycl::event + asin_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AsinOutputType, AsinStridedFunctor, asin_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AsinStridedFactory +{ + fnT get() + { + if constexpr (!AsinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asin_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::asin diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp new file mode 100644 index 000000000000..472e04f7cbe8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -0,0 +1,279 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ASINH(x) function. 
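+///
+/// For |Re(z)| or |Im(z)| above 1/epsilon the kernel switches to the
+/// asymptotic form asinh(z) ~ sign(Re z) * log(2 * sign(Re z) * z);
+/// on the host (illustrative):
+///
+///     std::asinh(1e20);                // ~46.7448
+///     std::log(2.0) + std::log(1e20);  // same value, log(2 * 1e20)
+///
+/// the O(1/z^2) correction term is far below double precision there.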
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::asinh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AsinhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isnan(x)) { + /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */ + if (std::isinf(y)) { + return resT{y, q_nan}; + } + /* asinh(NaN + I*0) = NaN + I*0 */ + if (y == realT(0)) { + return resT{q_nan, y}; + } + /* All other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + + if (std::isnan(y)) { + /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */ + if (std::isinf(x)) { + return resT{x, q_nan}; + } + /* All other cases involving NaN return NaN + I*NaN. */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including asinh(+-Inf + I*+-Inf) + * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2) as in -> + * infinity The above formula works for the imaginary part as well, + * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) + + * O(y/in^3) as in -> infinity, uniformly in y + */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + using sycl_complexT = exprm_ns::complex; + sycl_complexT log_in = (sycl::signbit(x)) + ? 
exprm_ns::log(sycl_complexT(-in)) + : exprm_ns::log(sycl_complexT(in)); + realT wx = log_in.real() + sycl::log(realT(2)); + realT wy = log_in.imag(); + + const realT res_re = sycl::copysign(wx, x); + const realT res_im = sycl::copysign(wy, y); + return resT{res_re, res_im}; + } + + /* ordinary cases */ + return exprm_ns::asinh(exprm_ns::complex(in)); // asinh(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::asinh(in); + } + } +}; + +template +using AsinhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AsinhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AsinhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AsinhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class asinh_contig_kernel; + +template +sycl::event asinh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AsinhHS = hyperparam_detail::AsinhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AsinhHS::vec_sz; + static constexpr std::uint8_t n_vec = AsinhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AsinhContigFactory +{ + fnT get() + { + if constexpr (!AsinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asinh_contig_impl; + return fn; + } + } +}; + +template +struct AsinhTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::asinh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AsinhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class asinh_strided_kernel; + +template +sycl::event + asinh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AsinhOutputType, AsinhStridedFunctor, asinh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AsinhStridedFactory +{ + fnT get() + { + if constexpr (!AsinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = asinh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::asinh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp new file mode 100644 index 000000000000..ab07a3fce3e0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -0,0 +1,288 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ATAN(x) function. 
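+///
+/// The complex branch uses the same reflection trick as asin, namely
+/// atan(z) = I * conj(atanh(I * conj(z))); a host-side check with
+/// std::complex (illustrative):
+///
+///     using C = std::complex<double>;
+///     const C z{0.3, -0.2}, I{0.0, 1.0};
+///     const C lhs = std::atan(z);
+///     const C rhs = I * std::conj(std::atanh(I * std::conj(z)));
+///     // lhs == rhs up to rounding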
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::atan +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::vec_size_utils::ContigHyperparameterSetDefault; +using dpctl::tensor::kernels::vec_size_utils::UnaryContigHyperparameterSetEntry; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AtanFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + /* + * atan(in) = I * conj( atanh(I * conj(in)) ) + * so we first calculate w = atanh(I * conj(in)) with + * x = real(I * conj(in)) = imag(in) + * y = imag(I * conj(in)) = real(in) + * and then return {imag(w), real(w)} which is atan(in) + */ + const realT x = std::imag(in); + const realT y = std::real(in); + if (std::isnan(x)) { + /* atanh(NaN + I*+-Inf) = sign(NaN)*0 + I*+-Pi/2 */ + if (std::isinf(y)) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT atanh_re = sycl::copysign(realT(0), x); + const realT atanh_im = sycl::copysign(pi_half, y); + return resT{atanh_im, atanh_re}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + else if (std::isnan(y)) { + /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */ + if (std::isinf(x)) { + const realT atanh_re = sycl::copysign(realT(0), x); + const realT atanh_im = q_nan; + return resT{atanh_im, atanh_re}; + } + /* atanh(+-0 + I*NaN) = +-0 + I*NaN */ + if (x == realT(0)) { + return resT{q_nan, x}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including + * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2 + * The sign of pi/2 depends on the sign of imaginary part of the + * input. 
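+             * For example, atan(1e30 + 0*I) evaluates to ~pi/2 and
+             * atan(-1e30 + 0*I) to ~-pi/2, matching the limits of the
+             * real atan at +-infinity.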
+ */ + static constexpr realT r_eps = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT atanh_re = realT(0); + const realT atanh_im = sycl::copysign(pi_half, y); + return resT{atanh_im, atanh_re}; + } + /* ordinary cases */ + return exprm_ns::atan(exprm_ns::complex(in)); // atan(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::atan(in); + } + } +}; + +template +using AtanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AtanStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AtanOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AtanContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class atan_contig_kernel; + +template +sycl::event atan_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AtanHS = hyperparam_detail::AtanContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AtanHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AtanContigFactory +{ + fnT get() + { + if constexpr (!AtanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan_contig_impl; + return fn; + } + } +}; + +template +struct AtanTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::atan(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AtanOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class atan_strided_kernel; + +template +sycl::event + atan_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AtanOutputType, AtanStridedFunctor, atan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AtanStridedFactory +{ + fnT get() + { + if constexpr (!AtanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::atan diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp new file mode 100644 index 000000000000..220722d5b596 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -0,0 +1,233 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ATAN2(x1, x2) +/// function. 
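+///
+/// The functor pins down the result for a finite first argument against
+/// +infinity as a signed zero, mirroring the IEEE behaviour of the host
+/// atan2 (illustrative):
+///
+///     const double inf = std::numeric_limits<double>::infinity();
+///     std::atan2(3.0, inf);    // +0.0
+///     std::atan2(-3.0, inf);   // -0.0
+///
+/// i.e. sycl::copysign(0.0, y) for any finite y.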
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::atan2 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct Atan2Functor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::false_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if (std::isinf(in2) && !sycl::signbit(in2)) { + if (std::isfinite(in1)) { + return sycl::copysign(resT(0), in1); + } + } + return sycl::atan2(in1, in2); + } +}; + +template +using Atan2ContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Atan2StridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct Atan2OutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct Atan2ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class atan2_contig_kernel; + +template +sycl::event atan2_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using Atan2HS = + hyperparam_detail::Atan2ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Atan2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Atan2HS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor, + atan2_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct Atan2ContigFactory +{ + fnT get() + { + if constexpr (!Atan2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan2_contig_impl; + return fn; + } + } +}; + +template +struct Atan2TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::atan2(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename Atan2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class atan2_strided_kernel; + +template +sycl::event + atan2_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, Atan2OutputType, Atan2StridedFunctor, + atan2_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Atan2StridedFactory +{ + fnT get() + { + if constexpr (!Atan2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atan2_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::atan2 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp new file mode 100644 index 000000000000..32f5384f4ad8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -0,0 +1,280 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ATANH(x) function. 
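The AtanhFunctor that follows implements the C99 Annex G special values for complex inputs by hand (NaN/Inf combinations, plus a large-magnitude path that saturates the imaginary part to +-pi/2). A small host-side check of two of those rules, assuming a conforming std::atanh for std::complex (illustrative only, not part of this change):

#include <cassert>
#include <cmath>
#include <complex>
#include <limits>

int main()
{
    using cd = std::complex<double>;
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();

    // atanh(+Inf + I*NaN) = +0 + I*NaN
    cd r1 = std::atanh(cd(inf, nan));
    assert(r1.real() == 0.0 && std::isnan(r1.imag()));

    // very large |x| or |y|: real part ~ 0, imaginary part -> +-pi/2
    cd r2 = std::atanh(cd(1e300, 1.0));
    assert(std::abs(r2.imag() - std::acos(-1.0) / 2.0) < 1e-12);
    return 0;
}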
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::atanh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct AtanhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isnan(x)) { + /* atanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */ + if (std::isinf(y)) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT res_re = sycl::copysign(realT(0), x); + const realT res_im = sycl::copysign(pi_half, y); + return resT{res_re, res_im}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + else if (std::isnan(y)) { + /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */ + if (std::isinf(x)) { + const realT res_re = sycl::copysign(realT(0), x); + return resT{res_re, q_nan}; + } + /* atanh(+-0 + I*NaN) = +-0 + I*NaN */ + if (x == realT(0)) { + return resT{x, q_nan}; + } + /* + * All other cases involving NaN return NaN + I*NaN. + */ + return resT{q_nan, q_nan}; + } + + /* + * For large x or y including + * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2 + * The sign of PI/2 depends on the sign of imaginary part of the + * input. 
+ */ + const realT RECIP_EPSILON = + realT(1) / std::numeric_limits::epsilon(); + if (sycl::fabs(x) > RECIP_EPSILON || + sycl::fabs(y) > RECIP_EPSILON) { + const realT pi_half = sycl::atan(realT(1)) * 2; + + const realT res_re = realT(0); + const realT res_im = sycl::copysign(pi_half, y); + return resT{res_re, res_im}; + } + /* ordinary cases */ + return exprm_ns::atanh(exprm_ns::complex(in)); // atanh(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::atanh(in); + } + } +}; + +template +using AtanhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using AtanhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct AtanhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct AtanhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class atanh_contig_kernel; + +template +sycl::event atanh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using AtanhHS = hyperparam_detail::AtanhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = AtanhHS::vec_sz; + static constexpr std::uint8_t n_vec = AtanhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz, + n_vec>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct AtanhContigFactory +{ + fnT get() + { + if constexpr (!AtanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atanh_contig_impl; + return fn; + } + } +}; + +template +struct AtanhTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::atanh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename AtanhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class atanh_strided_kernel; + +template +sycl::event + atanh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, AtanhOutputType, AtanhStridedFunctor, atanh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct AtanhStridedFactory +{ + fnT get() + { + if constexpr (!AtanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = atanh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::atanh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp new file mode 100644 index 000000000000..dae2e62a76b2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -0,0 +1,461 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_and(ar1, ar2) operation. 
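As with the other bitwise kernels in this change, the BitwiseAndFunctor below special-cases bool: the array API treats "bitwise" operations on bool as logical operations, so the kernel computes in1 && in2 rather than operating on stored byte patterns. A minimal host-side model of the scalar branch (illustrative only, not part of this change):

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T>
T bitwise_and(T a, T b)
{
    if constexpr (std::is_same_v<T, bool>) {
        return a && b; // logical AND for bool
    }
    else {
        return static_cast<T>(a & b); // ordinary bitwise AND
    }
}

int main()
{
    assert(bitwise_and(true, false) == false);
    assert(bitwise_and<std::uint8_t>(0b1100, 0b1010) == 0b1000);
    return 0;
}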
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::bitwise_and +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseAndFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_same_v) { + return in1 && in2; + } + else { + return (in1 & in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (in1 && in2); + return vec_cast( + tmp); + } + else { + return (in1 & in2); + } + } +}; + +template +using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseAndFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseAndStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseAndFunctor>; + +template +struct BitwiseAndOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseAndContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; +} // end of namespace hyperparam_detail + +template +class bitwise_and_contig_kernel; + +template +sycl::event + bitwise_and_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseAndHS = + hyperparam_detail::BitwiseAndContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor, + bitwise_and_contig_kernel, vec_sz, n_vec>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseAndContigFactory +{ + fnT get() + { + if constexpr (!BitwiseAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseAndTypeMapFactory +{ + /*! 
@brief get typeid for output type of x & y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseAndOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_and_strided_kernel; + +template +sycl::event + bitwise_and_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseAndOutputType, BitwiseAndStridedFunctor, + bitwise_and_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseAndStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseAndInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + if constexpr (std::is_same_v) { + res = res && in; + } + else { + res &= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res && in); + res = vec_cast( + tmp); + } + else { + res &= in; + } + } +}; + +template +using BitwiseAndInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseAndInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseAndInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseAndInplaceFunctor>; + +template +class bitwise_and_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise AND */ +template +struct BitwiseAndInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseAndInplaceTypeMapFactory +{ + /*!
@brief get typeid for output type of x &= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseAndInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_and_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseAndHS = + hyperparam_detail::BitwiseAndContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseAndInplaceContigFunctor, + bitwise_and_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseAndInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseAndInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_and_inplace_strided_kernel; + +template +sycl::event bitwise_and_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseAndInplaceStridedFunctor, + bitwise_and_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseAndInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseAndInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_and diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp new file mode 100644 index 000000000000..96da6b9627ab --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -0,0 +1,231 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
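The in-place sections of these headers gate kernel instantiation twice: the TypePairSupport structs whitelist (argT, resT) pairs via std::disjunction, and the factories return nullptr for anything outside the list, leaving that dispatch-table slot empty. A standard-C++ model of the trait chain, where PairEntry and NotDefined are illustrative stand-ins for dpctl's TypePairDefinedEntry and NotDefinedEntry (a sketch, not the real traits):

#include <cstdint>
#include <type_traits>

template <typename argT, typename A, typename resT, typename R>
struct PairEntry : std::bool_constant<std::is_same_v<argT, A> &&
                                      std::is_same_v<resT, R>>
{
};

struct NotDefined : std::false_type
{
};

// std::disjunction stops at the first matching entry and otherwise
// falls through to NotDefined, mirroring the structs in these headers.
template <typename argT, typename resT>
inline constexpr bool inplace_and_defined =
    std::disjunction<PairEntry<argT, bool, resT, bool>,
                     PairEntry<argT, std::int32_t, resT, std::int32_t>,
                     NotDefined>::value;

static_assert(inplace_and_defined<bool, bool>);
static_assert(!inplace_and_defined<float, float>); // no bitwise AND on float

int main() { return 0; }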
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of bitwise_invert(x) +/// function that inverts bits of binary representation of the argument. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::bitwise_invert +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseInvertFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_integral_v || std::is_same_v); + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::negation>; + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + if constexpr (std::is_same_v) { + return !in; + } + else { + return ~in; + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + return ~in; + } +}; + +template +using BitwiseInvertContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseInvertStridedFunctor = + elementwise_common::UnaryStridedFunctor>; + +template +struct BitwiseInvertOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct BitwiseInvertContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class bitwise_invert_contig_kernel; + +template +sycl::event + bitwise_invert_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using BitwiseInvertHS = + hyperparam_detail::BitwiseInvertContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz; + static constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, 
BitwiseInvertOutputType, BitwiseInvertContigFunctor, + bitwise_invert_contig_kernel, vec_sz, n_vec>(exec_q, nelems, arg_p, + res_p, depends); +} + +template +struct BitwiseInvertContigFactory +{ + fnT get() + { + if constexpr (!BitwiseInvertOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_invert_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseInvertTypeMapFactory +{ + /*! @brief get typeid for output type of ~x */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseInvertOutputType<T>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_invert_strided_kernel; + +template +sycl::event bitwise_invert_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, BitwiseInvertOutputType, BitwiseInvertStridedFunctor, + bitwise_invert_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct BitwiseInvertStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseInvertOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_invert_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_invert diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp new file mode 100644 index 000000000000..59279a803ed8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -0,0 +1,481 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED.
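The BitwiseInvertFunctor in the preceding bitwise_invert header follows the same bool convention: logical negation for bool, one's complement for integers. A host-side sketch of the scalar branch (illustrative only, not part of this change):

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T>
T invert(T x)
{
    if constexpr (std::is_same_v<T, bool>) {
        return !x; // array API: bool inversion is logical NOT
    }
    else {
        return static_cast<T>(~x); // one's complement for integers
    }
}

int main()
{
    assert(invert(true) == false);
    assert(invert<std::uint8_t>(0b00001111) == 0b11110000);
    assert(invert<std::int8_t>(0) == -1); // ~0 sets every bit
    return 0;
}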
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_left_shift(ar1, ar2) +/// operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::bitwise_left_shift +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseLeftShiftFunctor +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return impl(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + res[i] = impl(in1[i], in2[i]); + } + return res; + } + +private: + resT impl(const argT1 &in1, const argT2 &in2) const + { + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + return (in2 < in1_bitsize) ? (in1 << in2) : zero; + } + else { + return (in2 < argT2(0)) + ? zero + : ((in2 < in1_bitsize) ? 
(in1 << in2) : zero); + } + } +}; + +template +using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseLeftShiftFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseLeftShiftStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseLeftShiftFunctor>; + +template +struct BitwiseLeftShiftOutputType +{ + using ResT = T1; + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseLeftShiftContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class bitwise_left_shift_contig_kernel; + +template +sycl::event + bitwise_left_shift_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseLSHS = + hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseLeftShiftOutputType, + BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseLeftShiftContigFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseLeftShiftTypeMapFactory +{ + /*! 
@brief get typeid for output type of x << y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseLeftShiftOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_left_shift_strided_kernel; + +template +sycl::event bitwise_left_shift_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseLeftShiftOutputType, + BitwiseLeftShiftStridedFunctor, bitwise_left_shift_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseLeftShiftStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseLeftShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const { impl(res, in); } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) const + { + static constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res <<= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res <<= in) : res = zero); + } + } +}; + +template +using BitwiseLeftShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseLeftShiftInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseLeftShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseLeftShiftInplaceFunctor>; + +template +class bitwise_left_shift_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise left shift */ +template +struct BitwiseLeftShiftInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseLeftShiftInplaceTypeMapFactory +{ + /*!
@brief get typeid for output type of x <<= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseLeftShiftInplaceTypePairSupport< + argT, resT>::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_left_shift_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseLSHS = + hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, + bitwise_left_shift_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseLeftShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_left_shift_inplace_strided_kernel; + +template +sycl::event bitwise_left_shift_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseLeftShiftInplaceStridedFunctor, + bitwise_left_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseLeftShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseLeftShiftInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_left_shift diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp new file mode 100644 index 000000000000..6714f238ffce --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -0,0 +1,461 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
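The shift guard in the preceding bitwise_left_shift header exists because C++ leaves x << s undefined for negative s or s >= bit-width, while the array API requires 0 in both cases. A standalone restatement of that rule (illustrative; the kernel expresses the same logic with nested ternaries):

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T, typename S>
T safe_left_shift(T x, S s)
{
    constexpr S width = static_cast<S>(sizeof(T) * 8);
    if constexpr (std::is_signed_v<S>) {
        if (s < S(0)) {
            return T(0); // negative shift count: mandated 0
        }
    }
    return (s < width) ? static_cast<T>(x << s) : T(0);
}

int main()
{
    assert((safe_left_shift<std::uint8_t>(0b0001, 3)) == 0b1000);
    assert((safe_left_shift<std::uint8_t>(1, 8)) == 0);  // UB if done raw
    assert((safe_left_shift<std::int32_t>(1, -1)) == 0); // UB if done raw
    return 0;
}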
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_or(ar1, ar2) operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::bitwise_or +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseOrFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_same_v) { + return in1 || in2; + } + else { + return (in1 | in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (in1 || in2); + return vec_cast( + tmp); + } + else { + return (in1 | in2); + } + } +}; + +template +using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseOrFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseOrStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseOrFunctor>; + +template +struct BitwiseOrOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseOrContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace 
hyperparam_detail + +template +class bitwise_or_contig_kernel; + +template +sycl::event bitwise_or_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseOrHS = + hyperparam_detail::BitwiseOrContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor, + bitwise_or_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseOrContigFactory +{ + fnT get() + { + if constexpr (!BitwiseOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseOrTypeMapFactory +{ + /*! @brief get typeid for output type of x | y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseOrOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_or_strided_kernel; + +template +sycl::event + bitwise_or_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseOrOutputType, BitwiseOrStridedFunctor, + bitwise_or_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseOrStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseOrInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + if constexpr (std::is_same_v) { + res = res || in; + } + else { + res |= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res || in); + res = vec_cast( + tmp); + } + else { + res |= in; + } + } +}; + +template +using BitwiseOrInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseOrInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseOrInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseOrInplaceFunctor>; + +template +class bitwise_or_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise OR */ +template +struct BitwiseOrInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, +
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseOrInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x |= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseOrInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + bitwise_or_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseOrHS = + hyperparam_detail::BitwiseOrContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseOrInplaceContigFunctor, + bitwise_or_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseOrInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseOrInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_or_inplace_strided_kernel; + +template +sycl::event bitwise_or_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseOrInplaceStridedFunctor, + bitwise_or_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseOrInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseOrInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_or diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp new file mode 100644 index 000000000000..241852b6a06e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -0,0 +1,487 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
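Throughout these headers the *ContigHyperparameterSet structs pick a (vec_sz, n_vecs) pair per type combination; one work-item of a contiguous kernel then processes n_vecs chunks of vec_sz elements. A rough model of the resulting launch arithmetic (the exact rounding and sub-group handling live in the shared common.hpp machinery, which this diff does not show):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
    const std::size_t nelems = 10000;
    const std::uint8_t vec_sz = 4, n_vecs = 2;

    // each work-item covers vec_sz * n_vecs contiguous elements
    const std::size_t per_wi = std::size_t(vec_sz) * n_vecs;
    const std::size_t n_wi = (nelems + per_wi - 1) / per_wi;

    std::printf("%zu work-items for %zu elements\n", n_wi, nelems);
    return 0;
}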
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_right_shift(ar1, ar2) +/// operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::bitwise_right_shift +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseRightShiftFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return impl(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + res[i] = impl(in1[i], in2[i]); + } + return res; + } + +private: + resT impl(const argT1 &in1, const argT2 &in2) const + { + static constexpr argT2 in1_bitsize = + static_cast(sizeof(argT1) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + return (in2 < in1_bitsize) ? (in1 >> in2) : zero; + } + else { + return (in2 < argT2(0)) + ? zero + : ((in2 < in1_bitsize) + ? (in1 >> in2) + : (in1 < argT1(0) ? 
resT(-1) : zero)); + } + } +}; + +template +using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseRightShiftFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseRightShiftStridedFunctor = + elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseRightShiftFunctor>; + +template +struct BitwiseRightShiftOutputType +{ + using ResT = T1; + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseRightShiftContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class bitwise_right_shift_contig_kernel; + +template +sycl::event bitwise_right_shift_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseRSHS = + hyperparam_detail::BitwiseRightShiftContigHyperparameterSet; + constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; + constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseRightShiftOutputType, + BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct BitwiseRightShiftContigFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseRightShiftTypeMapFactory +{ + /*! 
@brief get typeid for output type of x >> y + */ + std::enable_if_t<std::is_same<fnT, int>::value, int> get() + { + using rT = typename BitwiseRightShiftOutputType<T1, T2>::value_type; + return td_ns::GetTypeid<rT>{}.get(); + } +}; + +template +class bitwise_right_shift_strided_kernel; + +template +sycl::event bitwise_right_shift_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseRightShiftOutputType, + BitwiseRightShiftStridedFunctor, bitwise_right_shift_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseRightShiftStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseRightShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const { impl(res, in); } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) const + { + static constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + static constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res >>= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res >>= in) + : (res < resT(0)) ? res = resT(-1) + : res = zero); + } + } +}; + +template +using BitwiseRightShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseRightShiftInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseRightShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseRightShiftInplaceFunctor>; + +template +class bitwise_right_shift_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise right shift */ +template +struct BitwiseRightShiftInplaceTypePairSupport +{ + /* value if true a kernel for <argTy, resTy> must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseRightShiftInplaceTypeMapFactory +{ + /*!
@brief get typeid for output type of x >>= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseRightShiftInplaceTypePairSupport< + argT, resT>::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_right_shift_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseRSHS = + hyperparam_detail::BitwiseRightShiftContigHyperparameterSet; + + // res = OP(res, arg) + static constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseRightShiftInplaceContigFunctor, + bitwise_right_shift_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseRightShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftInplaceTypePairSupport< + T1, T2>::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_right_shift_inplace_strided_kernel; + +template +sycl::event bitwise_right_shift_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseRightShiftInplaceStridedFunctor, + bitwise_right_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseRightShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseRightShiftInplaceTypePairSupport< + T1, T2>::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_right_shift diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp new file mode 100644 index 000000000000..292cf3f76df6 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -0,0 +1,465 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise bitwise_xor(ar1, ar2) operation. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::bitwise_xor +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct BitwiseXorFunctor +{ + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_same_v) { + // (false != false) -> false, (false != true) -> true + // (true != false) -> true, (true != true) -> false + return (in1 != in2); + } + else { + return (in1 ^ in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (in1 != in2); + return vec_cast( + tmp); + } + else { + return (in1 ^ in2); + } + } +}; + +template +using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + BitwiseXorFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseXorStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + BitwiseXorFunctor>; + +template +struct BitwiseXorOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct BitwiseXorContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto 
vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class bitwise_xor_contig_kernel; + +template +sycl::event + bitwise_xor_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseXorHS = + hyperparam_detail::BitwiseXorContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor, + bitwise_xor_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct BitwiseXorContigFactory +{ + fnT get() + { + if constexpr (!BitwiseXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_contig_impl; + return fn; + } + } +}; + +template +struct BitwiseXorTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename BitwiseXorOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class bitwise_xor_strided_kernel; + +template +sycl::event + bitwise_xor_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, BitwiseXorOutputType, BitwiseXorStridedFunctor, + bitwise_xor_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct BitwiseXorStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_strided_impl; + return fn; + } + } +}; + +template +struct BitwiseXorInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + if constexpr (std::is_same_v) { + res = (res != in); + } + else { + res ^= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res != in); + res = vec_cast( + tmp); + } + else { + res ^= in; + } + } +}; + +template +using BitwiseXorInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseXorInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using BitwiseXorInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseXorInplaceFunctor>; + +template +class bitwise_xor_inplace_contig_kernel; + +/* @brief Types supported by in-place bitwise XOR */ +template +struct BitwiseXorInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, 
+ td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct BitwiseXorInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x ^= y */ + std::enable_if_t::value, int> get() + { + if constexpr (BitwiseXorInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event bitwise_xor_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using BitwiseXorHS = + hyperparam_detail::BitwiseXorContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseXorInplaceContigFunctor, + bitwise_xor_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseXorInplaceContigFactory +{ + fnT get() + { + if constexpr (!BitwiseXorInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_xor_inplace_strided_kernel; + +template +sycl::event bitwise_xor_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseXorInplaceStridedFunctor, + bitwise_xor_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseXorInplaceStridedFactory +{ + fnT get() + { + if constexpr (!BitwiseXorInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::bitwise_xor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp new file mode 100644 index 000000000000..ae632061571f --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp @@ -0,0 +1,77 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines an implementation of the complex absolute value. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "sycl_complex.hpp" + +namespace dpctl::tensor::kernels::detail +{ + +template +realT cabs(std::complex const &z) +{ + // Special values for cabs( x + y * 1j): + // * If x is either +infinity or -infinity and y is any value + // (including NaN), the result is +infinity. + // * If x is any value (including NaN) and y is either +infinity or + // -infinity, the result is +infinity. + // * If x is either +0 or -0, the result is equal to abs(y). + // * If y is either +0 or -0, the result is equal to abs(x). + // * If x is NaN and y is a finite number, the result is NaN. + // * If x is a finite number and y is NaN, the result is NaN. + // * If x is NaN and y is NaN, the result is NaN. + + const realT x = std::real(z); + const realT y = std::imag(z); + + static constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + static constexpr realT p_inf = std::numeric_limits::infinity(); + + const realT res = + std::isinf(x) + ? p_inf + : ((std::isinf(y) + ? p_inf + : ((std::isnan(x) + ? q_nan + : exprm_ns::abs(exprm_ns::complex(z)))))); + + return res; +} + +} // namespace dpctl::tensor::kernels::detail diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp new file mode 100644 index 000000000000..20fb0ea7bcda --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -0,0 +1,206 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CBRT(x) +/// function that computes a cube root. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::cbrt +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct CbrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const { return sycl::cbrt(in); } +}; + +template +using CbrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CbrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CbrtOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CbrtContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class cbrt_contig_kernel; + +template +sycl::event cbrt_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CbrtHS = hyperparam_detail::CbrtContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CbrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = CbrtHS::n_vecs; 
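+ // The hyperparameter set above picks the per-type SIMD shape; each
+ // work-item processes vec_sz * n_vecs contiguous elements, so the
+ // helper below sizes the launch at roughly nelems / (vec_sz * n_vecs)
+ // work-items (see unary_contig_impl in common.hpp).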
+ + return elementwise_common::unary_contig_impl< + argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CbrtContigFactory +{ + fnT get() + { + if constexpr (!CbrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_contig_impl; + return fn; + } + } +}; + +template +struct CbrtTypeMapFactory +{ + /*! @brief get typeid for output type of std::cbrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CbrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class cbrt_strided_kernel; + +template +sycl::event + cbrt_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CbrtStridedFactory +{ + fnT get() + { + if constexpr (!CbrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::cbrt diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp new file mode 100644 index 000000000000..08fd4da2fb50 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -0,0 +1,230 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CEIL(x) function. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::ceil +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct CeilFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + return in; + } + else { + if (in == 0) { + return in; + } + return sycl::ceil(in); + } + } +}; + +template +using CeilContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CeilStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CeilOutputType +{ + using value_type = + typename std::disjunction, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CeilContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class ceil_contig_kernel; + +template +sycl::event ceil_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CeilHS = hyperparam_detail::CeilContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CeilHS::vec_sz; + static constexpr std::uint8_t n_vecs = CeilHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CeilContigFactory +{ + fnT get() + { + if constexpr (!CeilOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = ceil_contig_impl; + return fn; + } + } +}; + +template +struct CeilTypeMapFactory +{ + /*! 
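+ For integral inputs the functor above is the identity; for floating
+ point it defers to sycl::ceil, with an explicit zero check that
+ passes signed zeros through unchanged (ceil(-0.0) stays -0.0).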
@brief get typeid for output type of sycl::ceil(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CeilOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class ceil_strided_kernel; + +template +sycl::event + ceil_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CeilOutputType, CeilStridedFunctor, ceil_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CeilStridedFactory +{ + fnT get() + { + if constexpr (!CeilOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = ceil_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::ceil diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp new file mode 100644 index 000000000000..cfe3f4898491 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -0,0 +1,1036 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines common code for elementwise tensor operations. 
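+/// The contiguous functors below dispatch between a constant-value fast
+/// path (unary ops whose result is a known constant), a sycl::vec path
+/// for operators that support vectors, and a scalar loop; sub-group
+/// loads/stores are enabled only when the data pointers meet the
+/// required alignment (see is_aligned / disabled_sg_loadstore_wrapper_krn).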
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "common_detail.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" + +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::elementwise_common +{ +using dpctl::tensor::ssize_t; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +/*! @brief Functor for unary function evaluation on contiguous array */ +template +struct UnaryContigFunctor +{ +private: + const argT *in = nullptr; + resT *out = nullptr; + std::size_t nelems_; + +public: + UnaryContigFunctor(const argT *inp, resT *res, const std::size_t n_elems) + : in(inp), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + UnaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && + UnaryOperatorT::is_constant::value) { + // value of operator is known to be a known constant + constexpr resT const_val = UnaryOperatorT::constant_value; + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + elems_per_wi * sgSize < nelems_) { + static constexpr sycl::vec res_vec(const_val); +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = const_val; + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && + UnaryOperatorT::supports_vec::value && + (vec_sz > 1)) { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec x = + sub_group_load(sg, in_multi_ptr); + const sycl::vec res_vec = op(x); + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + // scalar call + out[k] = 
op(in[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value && + std::is_same_v) { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + sycl::vec arg_vec = + sub_group_load(sg, in_multi_ptr); +#pragma unroll + for (std::uint32_t k = 0; k < vec_sz; ++k) { + arg_vec[k] = op(arg_vec[k]); + } + sub_group_store(sg, arg_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + UnaryOperatorT::supports_sg_loadstore::value) { + // default: use scalar-value function + + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg_vec = + sub_group_load(sg, in_multi_ptr); + sycl::vec res_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + res_vec[k] = op(arg_vec[k]); + } + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in[k]); + } + } + } + else { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + out[offset] = op(in[offset]); + } + } + } +}; + +template +struct UnaryStridedFunctor +{ +private: + const argT *inp_ = nullptr; + resT *res_ = nullptr; + IndexerT inp_out_indexer_; + +public: + UnaryStridedFunctor(const argT *inp_p, + resT *res_p, + const IndexerT &inp_out_indexer) + : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &offsets_ = inp_out_indexer_(wid.get(0)); + const ssize_t &inp_offset = offsets_.get_first_offset(); + const ssize_t &res_offset = offsets_.get_second_offset(); + + UnaryOpT op{}; + + res_[res_offset] = op(inp_[inp_offset]); + } +}; + +template +SizeT select_lws(const 
sycl::device &, SizeT n_work_items_needed) +{ + // TODO: make the decision based on device descriptors + + // constexpr SizeT few_threshold = (SizeT(1) << 17); + static constexpr SizeT med_threshold = (SizeT(1) << 21); + + const SizeT lws = + (n_work_items_needed <= med_threshold ? SizeT(128) : SizeT(256)); + + return lws; +} + +template class UnaryOutputType, + template class ContigFunctorT, + template class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event unary_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + const std::size_t n_work_items_needed = nelems / elems_per_wi; + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); + + const std::size_t n_groups = + ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename UnaryOutputType::value_type; + using BaseKernelName = kernel_name; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + if (is_aligned(arg_p) && + is_aligned(res_p)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = BaseKernelName; + using Impl = ContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = ContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + }); + + return comp_ev; +} + +template class UnaryOutputType, + template class StridedFunctorT, + template class kernel_name> +sycl::event + unary_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename UnaryOutputType::value_type; + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(arg_p); + resTy *res_tp = reinterpret_cast(res_p); + + using Impl = StridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template +struct BinaryContigFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + std::size_t nelems_; + +public: + BinaryContigFunctor(const argT1 *inp1, + const argT2 *inp2, + resT *res, + const std::size_t n_elems) + : in1(inp1), in2(inp2), out(res), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + BinaryOperatorT op{}; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NOTE: work-group size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value && + 
BinaryOperatorT::supports_vec::value && (vec_sz > 1)) { + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { + sycl::vec res_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + std::size_t offset = base + it * sgSize; + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[offset]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg1_vec = + sub_group_load(sg, in1_multi_ptr); + const sycl::vec arg2_vec = + sub_group_load(sg, in2_multi_ptr); + res_vec = op(arg1_vec, arg2_vec); + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + BinaryOperatorT::supports_sg_loadstore::value) { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[offset]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[offset]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[offset]); + + const sycl::vec arg1_vec = + sub_group_load(sg, in1_multi_ptr); + const sycl::vec arg2_vec = + sub_group_load(sg, in2_multi_ptr); + + sycl::vec res_vec; +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + res_vec[vec_id] = + op(arg1_vec[vec_id], arg2_vec[vec_id]); + } + sub_group_store(sg, res_vec, out_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + out[k] = op(in1[k], in2[k]); + } + } + } + else { + const std::size_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::size_t elems_per_sg = sgSize * elems_per_wi; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + out[offset] = op(in1[offset], in2[offset]); + } + } + } +}; + +template +struct BinaryStridedFunctor +{ +private: + const argT1 *in1 = nullptr; + const argT2 *in2 = nullptr; + resT *out = nullptr; + ThreeOffsets_IndexerT three_offsets_indexer_; + +public: + BinaryStridedFunctor(const argT1 *inp1_tp, + const argT2 *inp2_tp, + resT *res_tp, + const ThreeOffsets_IndexerT &inps_res_indexer) + : in1(inp1_tp), 
in2(inp2_tp), out(res_tp), + three_offsets_indexer_(inps_res_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &three_offsets_ = + three_offsets_indexer_(static_cast(wid.get(0))); + + const auto &inp1_offset = three_offsets_.get_first_offset(); + const auto &inp2_offset = three_offsets_.get_second_offset(); + const auto &out_offset = three_offsets_.get_third_offset(); + + BinaryOperatorT op{}; + out[out_offset] = op(in1[inp1_offset], in2[inp2_offset]); + } +}; + +template +struct BinaryContigMatrixContigRowBroadcastingFunctor +{ +private: + const argT1 *mat; + const argT2 *padded_vec; + resT *res; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryContigMatrixContigRowBroadcastingFunctor(const argT1 *mat_tp, + const argT2 *row_tp, + resT *res_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : mat(mat_tp), padded_vec(row_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* NOTE: work-group size must be divisible by sub-group size */ + + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + const auto &sg = ndit.get_sub_group(); + const std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT1 mat_el = sub_group_load(sg, in1_multi_ptr); + const argT2 vec_el = sub_group_load(sg, in2_multi_ptr); + + resT res_el = op(mat_el, vec_el); + + sub_group_store(sg, res_el, out_multi_ptr); + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { + res[k] = op(mat[k], padded_vec[k % n1]); + } + } + } +}; + +template +struct BinaryContigRowContigMatrixBroadcastingFunctor +{ +private: + const argT1 *padded_vec; + const argT2 *mat; + resT *res; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryContigRowContigMatrixBroadcastingFunctor(const argT1 *row_tp, + const argT2 *mat_tp, + resT *res_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : padded_vec(row_tp), mat(mat_tp), res(res_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* NOTE: work-group size must be divisible by sub-group size */ + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + const auto &sg = ndit.get_sub_group(); + std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + 
sycl::access::decorated::yes>(&res[base]); + + const argT2 mat_el = sub_group_load(sg, in2_multi_ptr); + const argT1 vec_el = sub_group_load(sg, in1_multi_ptr); + + resT res_el = op(vec_el, mat_el); + + sub_group_store(sg, res_el, out_multi_ptr); + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) { + res[k] = op(padded_vec[k % n1], mat[k]); + } + } + } +}; + +// Typedefs for function pointers + +typedef sycl::event (*unary_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +typedef sycl::event (*unary_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +template class BinaryOutputType, + template class BinaryContigFunctorT, + template class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event binary_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + const std::size_t n_work_items_needed = nelems / (n_vecs * vec_sz); + const std::size_t lws = + select_lws(exec_q.get_device(), n_work_items_needed); + + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + using resTy = typename BinaryOutputType::value_type; + using BaseKernelName = kernel_name; + + const argTy1 *arg1_tp = + reinterpret_cast(arg1_p) + arg1_offset; + const argTy2 *arg2_tp = + reinterpret_cast(arg2_p) + arg2_offset; + resTy *res_tp = reinterpret_cast(res_p) + res_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + if (is_aligned(arg1_tp) && + is_aligned(arg2_tp) && + is_aligned(res_tp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = BaseKernelName; + using Impl = BinaryContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg1_tp, arg2_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = BinaryContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg1_tp, arg2_tp, res_tp, nelems)); + } + }); + return comp_ev; +} + +template class BinaryOutputType, + template class 
BinaryStridedFunctorT, + template class kernel_name> +sycl::event + binary_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using resTy = typename BinaryOutputType::value_type; + + using IndexerT = + typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + + const IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset, + shape_and_strides}; + + const argTy1 *arg1_tp = reinterpret_cast(arg1_p); + const argTy2 *arg2_tp = reinterpret_cast(arg2_p); + resTy *res_tp = reinterpret_cast(res_p); + + using Impl = BinaryStridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg1_tp, arg2_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template < + typename argT1, + typename argT2, + typename resT, + template class BinaryContigMatrixContigRowBroadcastFunctorT, + template class kernel_name> +sycl::event binary_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(mat[i,j], vec[j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; + const argT2 *vec = reinterpret_cast(vec_p) + vec_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). 
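+ // Illustrative bounds check (numbers chosen for the example): with
+ // n1 = 5 and a max sub-group size of 8, padded_vec holds 13 elements
+ // (the row repeated, wrapped), so a load of 8 lanes starting at any
+ // base % n1 <= 4 touches index at most 4 + 7 = 11 < 13.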
+ // The vector is padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = + BinaryContigMatrixContigRowBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(mat, padded_vec, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +template < + typename argT1, + typename argT2, + typename resT, + template class BinaryContigRowContigMatrixBroadcastFunctorT, + template class kernel_name> +sycl::event binary_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(vec[j], mat[i,j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; + const argT2 *mat = reinterpret_cast(mat_p) + mat_offset; + resT *res = reinterpret_cast(res_p) + res_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT2 *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). 
The vector is + // padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = + BinaryContigRowContigMatrixBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(padded_vec, mat, res, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +}; +} // namespace dpctl::tensor::kernels::elementwise_common diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp new file mode 100644 index 000000000000..68d025ec6307 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines common code for elementwise tensor operations. 
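+///
+/// The helper below builds a padded copy of a short row so that sub-group
+/// loads in the row-broadcast kernels may safely read up to one sub-group
+/// width past the logical end of the row. A minimal usage sketch (the
+/// queue, pointers, and sizes are illustrative placeholders, not part of
+/// this patch):
+///
+/// \code
+///   // pad an n1-element row out to n1 + max_sgSize elements
+///   sycl::event ev = elementwise_detail::populate_padded_vector<float>(
+///       q, row_ptr, n1, padded_ptr, n1 + max_sgSize, {});
+/// \endcode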
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::kernels::elementwise_detail
+{
+template <typename T>
+class populate_padded_vec_krn;
+
+template <typename T>
+sycl::event
+    populate_padded_vector(sycl::queue &exec_q,
+                           const T *vec,
+                           std::size_t vec_sz,
+                           T *padded_vec,
+                           std::size_t padded_vec_sz,
+                           const std::vector<sycl::event> &dependent_events)
+{
+    sycl::event populate_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
+        // ensure vec contains actual data
+        cgh.depends_on(dependent_events);
+
+        sycl::range<1> gRange{padded_vec_sz};
+
+        cgh.parallel_for<populate_padded_vec_krn<T>>(
+            gRange, [=](sycl::id<1> id) {
+                std::size_t i = id[0];
+                padded_vec[i] = vec[i % vec_sz];
+            });
+    });
+
+    return populate_padded_vec_ev;
+}
+} // namespace dpctl::tensor::kernels::elementwise_detail
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
new file mode 100644
index 000000000000..61902fce888a
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -0,0 +1,476 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines common code for in-place elementwise tensor operations.
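+///
+/// The functors below are parameterized by an operation type whose call
+/// operator mutates its first argument, i.e. op(lhs, rhs) stores the result
+/// into lhs. A minimal sketch of such an operation (hypothetical, shown only
+/// to illustrate the expected interface):
+///
+/// \code
+///   template <typename argT, typename resT>
+///   struct AddInplaceOp
+///   {
+///       using supports_sg_loadstore = std::true_type;
+///       using supports_vec = std::false_type;
+///
+///       void operator()(resT &lhs, const argT &rhs) const { lhs += rhs; }
+///   };
+/// \endcode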
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" + +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common_detail.hpp" + +namespace dpctl::tensor::kernels::elementwise_common +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +struct BinaryInplaceContigFunctor +{ +private: + const argT *rhs = nullptr; + resT *lhs = nullptr; + std::size_t nelems_; + +public: + BinaryInplaceContigFunctor(const argT *rhs_tp, + resT *lhs_tp, + const std::size_t n_elems) + : rhs(rhs_tp), lhs(lhs_tp), nelems_(n_elems) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + BinaryInplaceOperatorT op{}; + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + /* Each work-item processes vec_sz elements, contiguous in memory */ + /* NB: Workgroup size must be divisible by sub-group size */ + + if constexpr (enable_sg_loadstore && + BinaryInplaceOperatorT::supports_sg_loadstore::value && + BinaryInplaceOperatorT::supports_vec::value && + (vec_sz > 1)) { + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_max_local_range()[0]; + + std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { + +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[offset]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[offset]); + + const sycl::vec &arg_vec = + sub_group_load(sg, rhs_multi_ptr); + sycl::vec res_vec = + sub_group_load(sg, lhs_multi_ptr); + op(res_vec, arg_vec); + + sub_group_store(sg, res_vec, lhs_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + op(lhs[k], rhs[k]); + } + } + } + else if constexpr (enable_sg_loadstore && + BinaryInplaceOperatorT::supports_sg_loadstore:: + value) { + auto sg = ndit.get_sub_group(); + std::uint16_t sgSize = sg.get_max_local_range()[0]; + + std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems_) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[offset]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[offset]); + + const sycl::vec arg_vec = + sub_group_load(sg, rhs_multi_ptr); + sycl::vec res_vec = + sub_group_load(sg, lhs_multi_ptr); +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id 
< vec_sz; ++vec_id) { + op(res_vec[vec_id], arg_vec[vec_id]); + } + sub_group_store(sg, res_vec, lhs_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) { + op(lhs[k], rhs[k]); + } + } + } + else { + const std::size_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::size_t elems_per_sg = elems_per_wi * sgSize; + + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems_, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + op(lhs[offset], rhs[offset]); + } + } + } +}; + +template +struct BinaryInplaceStridedFunctor +{ +private: + const argT *rhs = nullptr; + resT *lhs = nullptr; + TwoOffsets_IndexerT two_offsets_indexer_; + +public: + BinaryInplaceStridedFunctor(const argT *rhs_tp, + resT *lhs_tp, + const TwoOffsets_IndexerT &inp_res_indexer) + : rhs(rhs_tp), lhs(lhs_tp), two_offsets_indexer_(inp_res_indexer) + { + } + + void operator()(sycl::id<1> wid) const + { + const auto &two_offsets_ = + two_offsets_indexer_(static_cast(wid.get(0))); + + const auto &inp_offset = two_offsets_.get_first_offset(); + const auto &lhs_offset = two_offsets_.get_second_offset(); + + BinaryInplaceOperatorT op{}; + op(lhs[lhs_offset], rhs[inp_offset]); + } +}; + +template +struct BinaryInplaceRowMatrixBroadcastingFunctor +{ +private: + const argT *padded_vec; + resT *mat; + std::size_t n_elems; + std::size_t n1; + +public: + BinaryInplaceRowMatrixBroadcastingFunctor(const argT *row_tp, + resT *mat_tp, + std::size_t n_elems_in_mat, + std::size_t n_elems_in_row) + : padded_vec(row_tp), mat(mat_tp), n_elems(n_elems_in_mat), + n1(n_elems_in_row) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + /* Workgroup size is expected to be a multiple of sub-group size */ + BinaryOperatorT op{}; + static_assert(BinaryOperatorT::supports_sg_loadstore::value); + + auto sg = ndit.get_sub_group(); + const std::size_t gid = ndit.get_global_linear_id(); + + std::uint8_t sgSize = sg.get_max_local_range()[0]; + std::size_t base = gid - sg.get_local_id()[0]; + + if (base + sgSize < n_elems) { + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + const argT vec_el = sub_group_load(sg, in_multi_ptr); + resT mat_el = sub_group_load(sg, out_multi_ptr); + + op(mat_el, vec_el); + + sub_group_store(sg, mat_el, out_multi_ptr); + } + else { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < n_elems; k += sgSize) { + op(mat[k], padded_vec[k % n1]); + } + } + } +}; + +// Typedefs for function pointers + +typedef sycl::event (*binary_inplace_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +typedef sycl::event (*binary_inplace_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)( + sycl::queue &, + std::vector &, + std::size_t, + std::size_t, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + 
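+// These function-pointer types are what the per-operator dispatch tables
+// store: each kernel header defines factory templates returning the matching
+// *_impl instantiation (or nullptr when the type combination is unsupported).
+// A sketch of how such a table might be populated (the table and factory
+// names here are illustrative, not defined in this file):
+//
+//   static binary_inplace_contig_impl_fn_ptr_t
+//       add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+//   td_ns::DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+//                               AddInplaceContigFactory, td_ns::num_types>
+//       dtb;
+//   dtb.populate_dispatch_table(add_inplace_contig_dispatch_table);
+//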
+template class BinaryInplaceContigFunctorT, + template class kernel_name, + std::uint8_t vec_sz = 4u, + std::uint8_t n_vecs = 2u> +sycl::event + binary_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *rhs_p, + ssize_t rhs_offset, + char *lhs_p, + ssize_t lhs_offset, + const std::vector &depends = {}) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lws = 128; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + const argTy *arg_tp = + reinterpret_cast(rhs_p) + rhs_offset; + resTy *res_tp = reinterpret_cast(lhs_p) + lhs_offset; + + if (is_aligned(arg_tp) && + is_aligned(res_tp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = kernel_name; + using Impl = + BinaryInplaceContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + else { + static constexpr bool disable_sg_loadstore = true; + using InnerKernelName = kernel_name; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = + BinaryInplaceContigFunctorT; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(arg_tp, res_tp, nelems)); + } + }); + return comp_ev; +} + +template class BinaryInplaceStridedFunctorT, + template class kernel_name> +sycl::event binary_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *rhs_p, + ssize_t rhs_offset, + char *lhs_p, + ssize_t lhs_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + using IndexerT = + typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const IndexerT indexer{nd, rhs_offset, lhs_offset, shape_and_strides}; + + const argTy *arg_tp = reinterpret_cast(rhs_p); + resTy *res_tp = reinterpret_cast(lhs_p); + + using Impl = BinaryInplaceStridedFunctorT; + + cgh.parallel_for>( + {nelems}, Impl(arg_tp, res_tp, indexer)); + }); + return comp_ev; +} + +template class BinaryInplaceRowMatrixBroadcastFunctorT, + template class kernel_name> +sycl::event binary_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + const argT *vec = reinterpret_cast(vec_p) + vec_offset; + resT *mat = reinterpret_cast(mat_p) + mat_offset; + + const auto &dev = exec_q.get_device(); + const auto &sg_sizes = dev.get_info(); + // Get device-specific kernel info max_sub_group_size + std::size_t max_sgSize = + *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + + std::size_t n1_padded = n1 + max_sgSize; + auto padded_vec_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(n1_padded, + exec_q); + argT *padded_vec = padded_vec_owner.get(); + + sycl::event make_padded_vec_ev = + dpctl::tensor::kernels::elementwise_detail::populate_padded_vector< + argT>(exec_q, vec, n1, padded_vec, n1_padded, depends); + + // sub-group spans work-items [I, I + sgSize) + // base = ndit.get_global_linear_id() - sg.get_local_id()[0] + // 
Generically, sub_group_load( &mat[base]) may load arrays from + // different rows of mat. The start corresponds to row (base / n0) + // We read sub_group_load(&padded_vec[(base / n0)]). The vector is + // padded to ensure that reads are accessible + + const std::size_t lws = 128; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(make_padded_vec_ev); + + auto lwsRange = sycl::range<1>(lws); + std::size_t n_elems = n0 * n1; + std::size_t n_groups = (n_elems + lws - 1) / lws; + auto gwsRange = sycl::range<1>(n_groups * lws); + + using Impl = BinaryInplaceRowMatrixBroadcastFunctorT; + + cgh.parallel_for>( + sycl::nd_range<1>(gwsRange, lwsRange), + Impl(padded_vec, mat, n_elems, n1)); + }); + + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, padded_vec_owner); + host_tasks.push_back(tmp_cleanup_ev); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels::elementwise_common diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp new file mode 100644 index 000000000000..2c965b236c87 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -0,0 +1,234 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CONJ(x) function. 
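+///
+/// For complex input types the functor returns the complex conjugate; for
+/// real, integer, and boolean inputs it is the identity, so conj over a
+/// float32 array copies values through unchanged while conj over a
+/// complex64 array flips the sign of the imaginary part.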
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::conj +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ConjFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using rT = typename argT::value_type; + + return exprm_ns::conj(exprm_ns::complex(in)); // conj(in); + } + else { + if constexpr (!std::is_same_v) + static_assert(std::is_same_v); + return in; + } + } +}; + +template +using ConjContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ConjStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ConjOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ConjContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class conj_contig_kernel; + +template +sycl::event conj_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ConjHS = hyperparam_detail::ConjContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ConjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ConjHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ConjContigFactory +{ + fnT get() + { + if constexpr (!ConjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = conj_contig_impl; + return fn; + } + } +}; + +template +struct ConjTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::conj(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ConjOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class conj_strided_kernel; + +template +sycl::event + conj_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ConjOutputType, ConjStridedFunctor, conj_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ConjStridedFactory +{ + fnT get() + { + if constexpr (!ConjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = conj_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::conj diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp new file mode 100644 index 000000000000..c2eb0f7e850e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -0,0 +1,248 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2) +/// function. 
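+///
+/// copysign(x1, x2) combines the magnitude of x1 with the sign bit of x2
+/// and is defined here for real floating-point types only; for example,
+/// copysign(3.0f, -0.0f) evaluates to -3.0f.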
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::copysign +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct CopysignFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::copysign(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = sycl::copysign(in1, in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using CopysignContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + CopysignFunctor>; + +template +struct CopysignOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct CopysignContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class copysign_contig_kernel; + +template +sycl::event copysign_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using CopySignHS = + hyperparam_detail::CopysignContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CopySignHS::vec_sz; + static constexpr std::uint8_t n_vecs = CopySignHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, + copysign_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct CopysignContigFactory +{ + fnT get() + { + if constexpr (!CopysignOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = copysign_contig_impl; + return fn; + } + } +}; + +template +struct CopysignTypeMapFactory +{ + /*! 
@brief get typeid for output type of copysign(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    copysign_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::copysign
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
new file mode 100644
index 000000000000..7bd47d54778b
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -0,0 +1,311 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COS(x) function.
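+///
+/// Real inputs map directly onto sycl::cos. Complex inputs with finite
+/// real and imaginary parts use the library cos; inputs with a non-finite
+/// component are routed through the identity cos(z) = cosh(i*z) so that
+/// the C99-style special-value cases can be handled uniformly.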
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::cos +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct CosFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + realT const &in_re = std::real(in); + realT const &in_im = std::imag(in); + + const bool in_re_finite = std::isfinite(in_re); + const bool in_im_finite = std::isfinite(in_im); + + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (in_re_finite && in_im_finite) { + return exprm_ns::cos(exprm_ns::complex(in)); // cos(in); + } + + /* + * since cos(in) = cosh(I * in), for special cases, + * we return cosh(I * in). + */ + const realT x = -in_im; + const realT y = in_re; + + const bool xfinite = in_im_finite; + const bool yfinite = in_re_finite; + /* + * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. + * + * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT y_m_y = (y - y); + const realT res_im = sycl::copysign(realT(0), x * y_m_y); + return resT{y_m_y, res_im}; + } + + /* + * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0. + * + * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0. + * The sign of 0 in the result is unspecified. + */ + if (y == realT(0) && !xfinite) { + const realT res_im = sycl::copysign(realT(0), x) * y; + return resT{x * x, res_im}; + } + + /* + * cosh(x +- I Inf) = dNaN + I dNaN. + * + * cosh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + const realT y_m_y = (y - y); + return resT{y_m_y, x * y_m_y}; + } + + /* + * cosh(+-Inf + I NaN) = +Inf + I d(NaN). + * + * cosh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * + * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y) + */ + if (std::isinf(x)) { + if (!yfinite) { + return resT{x * x, sycl::copysign(q_nan, x)}; + } + return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)}; + } + + /* + * cosh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * cosh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * cosh(NaN + I y) = d(NaN) + I d(NaN). 
+ */ + return resT{(x * x) * q_nan, (x + x) * q_nan}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::cos(in); + } + } +}; + +template +using CosContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CosStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CosOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CosContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class cos_contig_kernel; + +template +sycl::event cos_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CosHS = hyperparam_detail::CosContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CosHS::vec_sz; + static constexpr std::uint8_t n_vecs = CosHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CosContigFactory +{ + fnT get() + { + if constexpr (!CosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cos_contig_impl; + return fn; + } + } +}; + +template +struct CosTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::cos(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CosOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class cos_strided_kernel; + +template +sycl::event cos_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CosOutputType, CosStridedFunctor, cos_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CosStridedFactory +{ + fnT get() + { + if constexpr (!CosOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cos_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::cos diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp new file mode 100644 index 000000000000..505eb5fffc29 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -0,0 +1,301 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COSH(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::cosh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct CoshFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + + const bool xfinite = std::isfinite(x); + const bool yfinite = std::isfinite(y); + + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (xfinite && yfinite) { + return exprm_ns::cosh( + exprm_ns::complex(in)); // cosh(in); + } + + /* + * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. 
+ * + * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT res_im = sycl::copysign(realT(0), x * q_nan); + return resT{q_nan, res_im}; + } + + /* + * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0. + * + * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0. + * The sign of 0 in the result is unspecified. + */ + if (y == realT(0) && !xfinite) { + const realT res_im = sycl::copysign(realT(0), x) * y; + return resT{x * x, res_im}; + } + + /* + * cosh(x +- I Inf) = dNaN + I dNaN. + * + * cosh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + return resT{q_nan, x * q_nan}; + } + + /* + * cosh(+-Inf + I NaN) = +Inf + I d(NaN). + * + * cosh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * + * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y) + */ + if (std::isinf(x)) { + if (!yfinite) { + return resT{x * x, x * q_nan}; + } + return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)}; + } + + /* + * cosh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * cosh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * cosh(NaN + I y) = d(NaN) + I d(NaN). + */ + return resT{(x * x) * (y - y), (x + x) * (y - y)}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::cosh(in); + } + } +}; + +template +using CoshContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using CoshStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct CoshOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct CoshContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class cosh_contig_kernel; + +template +sycl::event cosh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using CoshHS = hyperparam_detail::CoshContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = CoshHS::vec_sz; + static constexpr std::uint8_t n_vecs = CoshHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct CoshContigFactory +{ + fnT get() + { + if constexpr (!CoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cosh_contig_impl; + return fn; + } + } +}; + +template +struct CoshTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::cosh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CoshOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class cosh_strided_kernel; + +template +sycl::event + cosh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CoshOutputType, CoshStridedFunctor, cosh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct CoshStridedFactory +{ + fnT get() + { + if constexpr (!CoshOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cosh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::cosh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp new file mode 100644 index 000000000000..07b3566c5cef --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -0,0 +1,316 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of equality of +/// tensor elements. 
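+///
+/// The comparison returns bool for every supported type pair. Mixed
+/// signed/unsigned integer operands are compared by value: for example,
+/// equal(int32_t(-1), uint32_t(4294967295)) yields false here, whereas a
+/// naive == after the usual arithmetic conversions would yield true.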
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct EqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) == + exprm_ns::complex(in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? false : (static_cast(in1) == in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? false + : (in1 == static_cast(in2)); + } + } + } + else { + return (in1 == in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = (in1 == in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using EqualContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using EqualStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct EqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct EqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class equal_contig_kernel; + +template +sycl::event equal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t 
res_offset, + const std::vector &depends = {}) +{ + using EqualHS = + hyperparam_detail::EqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = EqualHS::vec_sz; + static constexpr std::uint8_t n_vecs = EqualHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, EqualOutputType, EqualContigFunctor, + equal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct EqualContigFactory +{ + fnT get() + { + if constexpr (!EqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = equal_contig_impl; + return fn; + } + } +}; + +template +struct EqualTypeMapFactory +{ + /*! @brief get typeid for output type of operator()==(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename EqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class equal_strided_kernel; + +template +sycl::event + equal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, EqualOutputType, EqualStridedFunctor, + equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct EqualStridedFactory +{ + fnT get() + { + if constexpr (!EqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp new file mode 100644 index 000000000000..97789e53bb5a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -0,0 +1,269 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of EXP(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::exp +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ExpFunctor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + if (std::isfinite(x)) { + if (std::isfinite(y)) { + return exprm_ns::exp( + exprm_ns::complex(in)); // exp(in); + } + else { + return resT{q_nan, q_nan}; + } + } + else if (std::isnan(x)) { + /* x is nan */ + if (y == realT(0)) { + return resT{in}; + } + else { + return resT{x, q_nan}; + } + } + else { + if (!sycl::signbit(x)) { /* x is +inf */ + if (y == realT(0)) { + return resT{x, y}; + } + else if (std::isfinite(y)) { + return resT{x * sycl::cos(y), x * sycl::sin(y)}; + } + else { + /* x = +inf, y = +-inf || nan */ + return resT{x, q_nan}; + } + } + else { /* x is -inf */ + if (std::isfinite(y)) { + realT exp_x = sycl::exp(x); + return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)}; + } + else { + /* x = -inf, y = +-inf || nan */ + return resT{0, 0}; + } + } + } + } + else { + return sycl::exp(in); + } + } +}; + +template +using ExpContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ExpStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ExpOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = 
dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ExpContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class exp_contig_kernel; + +template +sycl::event exp_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ExpHS = hyperparam_detail::ExpContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ExpHS::vec_sz; + static constexpr std::uint8_t n_vecs = ExpHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ExpContigFactory +{ + fnT get() + { + if constexpr (!ExpOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp_contig_impl; + return fn; + } + } +}; + +template +struct ExpTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::exp(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ExpOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class exp_strided_kernel; + +template +sycl::event exp_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ExpOutputType, ExpStridedFunctor, exp_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ExpStridedFactory +{ + fnT get() + { + if constexpr (!ExpOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::exp diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp new file mode 100644 index 000000000000..dd09f4eee342 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -0,0 +1,272 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
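The complex branch of `ExpFunctor` above hand-rolls a C99-style special-value ladder instead of relying on a single library call. A compilable host-side sketch of the same branch structure, using `std::` math in place of the `sycl::`/`exprm_ns::` calls (assumed equivalent here only for illustration):

```cpp
#include <cassert>
#include <cmath>
#include <complex>
#include <limits>

// Mirrors ExpFunctor's complex path, but with std:: math so it runs anywhere.
static std::complex<double> ref_exp(const std::complex<double> &in)
{
    constexpr double q_nan = std::numeric_limits<double>::quiet_NaN();
    const double x = in.real();
    const double y = in.imag();
    if (std::isfinite(x)) {
        // a non-finite imaginary part gives no meaningful angle
        return std::isfinite(y) ? std::exp(in)
                                : std::complex<double>{q_nan, q_nan};
    }
    if (std::isnan(x)) {
        // exp(nan + 0i) preserves the signed zero; otherwise imag is nan too
        return (y == 0.0) ? in : std::complex<double>{x, q_nan};
    }
    if (!std::signbit(x)) { // x == +inf
        if (y == 0.0) {
            return {x, y};
        }
        return std::isfinite(y)
                   ? std::complex<double>{x * std::cos(y), x * std::sin(y)}
                   : std::complex<double>{x, q_nan};
    }
    // x == -inf: the magnitude collapses to zero for any finite y
    if (std::isfinite(y)) {
        const double e = std::exp(x); // == 0.0
        return {e * std::cos(y), e * std::sin(y)};
    }
    return {0.0, 0.0};
}

int main()
{
    constexpr double inf = std::numeric_limits<double>::infinity();
    assert((ref_exp({-inf, inf}) == std::complex<double>{0.0, 0.0}));
    assert(std::isnan(ref_exp({1.0, inf}).real())); // finite x, infinite y
    assert((ref_exp({inf, 0.0}) == std::complex<double>{inf, 0.0}));
}
```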
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of EXP2(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::exp2 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct Exp2Functor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + const argT tmp = in * sycl::log(realT(2)); + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(tmp); + const realT y = std::imag(tmp); + if (std::isfinite(x)) { + if (std::isfinite(y)) { + return exprm_ns::exp(exprm_ns::complex(tmp)); + } + else { + return resT{q_nan, q_nan}; + } + } + else if (std::isnan(x)) { + /* x is nan */ + if (y == realT(0)) { + return resT{in}; + } + else { + return resT{x, q_nan}; + } + } + else { + if (!sycl::signbit(x)) { /* x is +inf */ + if (y == realT(0)) { + return resT{x, y}; + } + else if (std::isfinite(y)) { + return resT{x * sycl::cos(y), x * sycl::sin(y)}; + } + else { + /* x = +inf, y = +-inf || nan */ + return resT{x, q_nan}; + } + } + else { /* x is -inf */ + if (std::isfinite(y)) { + realT exp_x = sycl::exp(x); + return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)}; + } + else { + /* x = -inf, y = +-inf || nan */ + return resT{0, 0}; + } + } + } + } + else { + return sycl::exp2(in); + } + } +}; + +template +using Exp2ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Exp2StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Exp2OutputType +{ + using value_type = typename std::disjunction< + 
td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Exp2ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class exp2_contig_kernel; + +template +sycl::event exp2_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Exp2HS = hyperparam_detail::Exp2ContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = Exp2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Exp2HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Exp2ContigFactory +{ + fnT get() + { + if constexpr (!Exp2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_contig_impl; + return fn; + } + } +}; + +template +struct Exp2TypeMapFactory +{ + /*! @brief get typeid for output type of sycl::exp2(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Exp2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class exp2_strided_kernel; + +template +sycl::event + exp2_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Exp2StridedFactory +{ + fnT get() + { + if constexpr (!Exp2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::exp2 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp new file mode 100644 index 000000000000..c29030a6dc95 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -0,0 +1,282 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
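`Exp2Functor` above reduces complex `exp2` to `exp(z * ln 2)` and then runs the same special-value ladder as `ExpFunctor` on the scaled argument. A small standalone check of the underlying identity (not part of the diff):

```cpp
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const std::complex<double> z{0.5, -1.25};
    const double ln2 = std::log(2.0);

    // what the functor evaluates internally
    const std::complex<double> via_exp = std::exp(z * ln2);

    // what exp2 means: magnitude 2^x, angle y * ln(2)
    const double mag = std::exp2(z.real());
    const std::complex<double> direct{mag * std::cos(z.imag() * ln2),
                                      mag * std::sin(z.imag() * ln2)};

    assert(std::abs(via_exp - direct) < 1e-12);
}
```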
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of EXPM1(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::expm1 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct Expm1Functor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + // expm1(x + I*y) = expm1(x)*cos(y) - 2*sin(y / 2)^2 + + // I*exp(x)*sin(y) + const realT x = std::real(in); + const realT y = std::imag(in); + + // special cases + if (std::isinf(x)) { + if (x > realT(0)) { + // positive infinity cases + if (!std::isfinite(y)) { + return resT{x, std::numeric_limits::quiet_NaN()}; + } + else if (y == realT(0)) { + return in; + } + else { + return (resT{sycl::copysign(x, sycl::cos(y)), + sycl::copysign(x, sycl::sin(y))}); + } + } + else { + // negative infinity cases + if (!std::isfinite(y)) { + // copy sign of y to guarantee + // conj(expm1(x)) == expm1(conj(x)) + return resT{realT(-1), sycl::copysign(realT(0), y)}; + } + else { + return resT{realT(-1), + sycl::copysign(realT(0), sycl::sin(y))}; + } + } + } + + if (std::isnan(x)) { + if (y == realT(0)) { + return in; + } + else { + return resT{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}; + } + } + + // x, y finite numbers + const realT cosY_val = sycl::cos(y); + const realT sinY_val = (y == 0) ? y : sycl::sin(y); + const realT sinhalfY_val = (y == 0) ? 
y : sycl::sin(y / 2); + + const realT res_re = + sycl::expm1(x) * cosY_val - 2 * sinhalfY_val * sinhalfY_val; + realT res_im = sycl::exp(x) * sinY_val; + return resT{res_re, res_im}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + static_assert(std::is_same_v); + if (in == 0) { + return in; + } + return sycl::expm1(in); + } + } +}; + +template +using Expm1ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Expm1StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Expm1OutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Expm1ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class expm1_contig_kernel; + +template +sycl::event expm1_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Expm1HS = hyperparam_detail::Expm1ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Expm1HS::vec_sz; + static constexpr std::uint8_t n_vecs = Expm1HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Expm1ContigFactory +{ + fnT get() + { + if constexpr (!Expm1OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = expm1_contig_impl; + return fn; + } + } +}; + +template +struct Expm1TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::expm1(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Expm1OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class expm1_strided_kernel; + +template +sycl::event + expm1_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Expm1OutputType, Expm1StridedFunctor, expm1_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Expm1StridedFactory +{ + fnT get() + { + if constexpr (!Expm1OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = expm1_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::expm1 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp new file mode 100644 index 000000000000..375659b94a12 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -0,0 +1,229 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of FLOOR(x) function. 
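Back in `Expm1Functor`, the real part is evaluated as `expm1(x)*cos(y) - 2*sin(y/2)^2` rather than `exp(x)*cos(y) - 1`; the rearrangement follows from `cos(y) - 1 = -2*sin(y/2)^2` and avoids catastrophic cancellation near the origin. A standalone numeric sanity check (assumed test harness, not part of the diff):

```cpp
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const double x = 1e-9;
    const double y = 1e-9;

    // rearranged form used by the kernel
    const double re =
        std::expm1(x) * std::cos(y) - 2.0 * std::sin(y / 2) * std::sin(y / 2);
    const double im = std::exp(x) * std::sin(y);

    // to first order, expm1(x + iy) ~= x + iy for tiny arguments
    assert(std::abs(re - x) < 1e-17);
    assert(std::abs(im - y) < 1e-17);

    // the naive form exp(z) - 1 cancels away most significant digits of the
    // real part (its error is on the order of the epsilon of 1.0, ~1e-16)
    const std::complex<double> naive =
        std::exp(std::complex<double>{x, y}) - 1.0;
    (void)naive;
}
```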
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::floor +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct FloorFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + return in; + } + else { + if (in == 0) { + return in; + } + return sycl::floor(in); + } + } +}; + +template +using FloorContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using FloorStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct FloorOutputType +{ + using value_type = + typename std::disjunction, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct FloorContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class floor_contig_kernel; + +template +sycl::event floor_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using FloorHS = hyperparam_detail::FloorContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = FloorHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct FloorContigFactory +{ + fnT get() + { + if constexpr (!FloorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_contig_impl; + return fn; + } + } +}; + +template +struct FloorTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::floor(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename FloorOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class floor_strided_kernel; + +template +sycl::event + floor_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, FloorOutputType, FloorStridedFunctor, floor_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct FloorStridedFactory +{ + fnT get() + { + if constexpr (!FloorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::floor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp new file mode 100644 index 000000000000..e669a97c04ea --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -0,0 +1,546 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of FLOOR_DIVIDE(x1, x2) +/// function. 
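The `FloorOutputType` table above (and its siblings in every other header here) relies on `std::disjunction` picking the first matching `TypeMapResultEntry`. A simplified, self-contained rendering of that trick; `MapEntry` and `DefaultEntry` are illustrative stand-ins for the `td_ns` helpers:

```cpp
#include <cstdint>
#include <type_traits>

// Each entry "is" std::true_type exactly when T matches ArgT, and carries the
// mapped result type; std::disjunction inherits from the first true entry.
template <typename T, typename ArgT, typename ResT>
struct MapEntry : std::is_same<T, ArgT>
{
    using result_type = ResT;
};

// Always-true fallback; `void` plays the role of the "unsupported" marker
// that the is_defined flags above test for.
template <typename T>
struct DefaultEntry : std::true_type
{
    using result_type = T;
};

template <typename T>
struct MyFloorOutputType
{
    using value_type = typename std::disjunction<
        MapEntry<T, std::int32_t, std::int32_t>, // integers pass through
        MapEntry<T, float, float>,
        MapEntry<T, double, double>,
        DefaultEntry<void>>::result_type;
};

static_assert(std::is_same_v<MyFloorOutputType<float>::value_type, float>);
static_assert(std::is_same_v<MyFloorOutputType<char>::value_type, void>);

int main() {}
```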
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::floor_divide +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct FloorDivideFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in2 == argT2(0)) { + return resT(0); + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto div = in1 / in2; + auto mod = in1 % in2; + auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0)); + return (div - corr); + } + else { + return (in1 / in2); + } + } + else { + auto div = in1 / in2; + return (div == resT(0)) ? div : resT(sycl::floor(div)); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] == argT2(0)) { + res[i] = resT(0); + } + else { + res[i] = in1[i] / in2[i]; + if constexpr (std::is_signed_v) { + auto mod = in1[i] % in2[i]; + auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0)); + res[i] -= corr; + } + } + } + return res; + } + else { + auto tmp = in1 / in2; + using tmpT = typename decltype(tmp)::element_type; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] != argT2(0)) { + tmp[i] = sycl::floor(tmp[i]); + } + } + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + return vec_cast(tmp); + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + FloorDivideFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + FloorDivideFunctor>; + +template +struct FloorDivideOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct FloorDivideContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class floor_divide_contig_kernel; + +template +sycl::event + 
floor_divide_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using FloorDivideHS = + hyperparam_detail::FloorDivideContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor, + floor_divide_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct FloorDivideContigFactory +{ + fnT get() + { + if constexpr (!FloorDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_contig_impl; + return fn; + } + } +}; + +template +struct FloorDivideTypeMapFactory +{ + /*! @brief get typeid for output type of floor_divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename FloorDivideOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class floor_divide_strided_kernel; + +template +sycl::event floor_divide_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, FloorDivideOutputType, FloorDivideStridedFunctor, + floor_divide_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct FloorDivideStridedFactory +{ + fnT get() + { + if constexpr (!FloorDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_strided_impl; + return fn; + } + } +}; + +template +struct FloorDivideInplaceFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + void operator()(resT &in1, const argT &in2) const + { + if constexpr (std::is_integral_v) { + if (in2 == argT(0)) { + in1 = 0; + return; + } + if constexpr (std::is_signed_v) { + auto tmp = in1; + in1 /= in2; + auto mod = tmp % in2; + auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0)); + in1 -= corr; + } + else { + in1 /= in2; + } + } + else { + in1 /= in2; + if (in1 == resT(0)) { + return; + } + in1 = sycl::floor(in1); + } + } + + template + void operator()(sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] == argT(0)) { + in1[i] = 0; + } + else { + if constexpr (std::is_signed_v) { + auto tmp = in1[i]; + in1[i] /= in2[i]; + auto mod = tmp % in2[i]; + auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0)); + in1[i] -= corr; + } + else { + in1[i] /= in2[i]; + } + } + } + } + else { + in1 /= in2; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] != argT(0)) { + in1[i] = sycl::floor(in1[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using FloorDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + FloorDivideInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template 
+using FloorDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + FloorDivideInplaceFunctor>; + +template +class floor_divide_inplace_contig_kernel; + +/* @brief Types supported by in-place floor division */ +template +struct FloorDivideInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct FloorDivideInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x //= y */ + std::enable_if_t::value, int> get() + { + if constexpr (FloorDivideInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event floor_divide_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using FloorDivideHS = + hyperparam_detail::FloorDivideContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz; + static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, FloorDivideInplaceContigFunctor, + floor_divide_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct FloorDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (!FloorDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class floor_divide_inplace_strided_kernel; + +template +sycl::event floor_divide_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, FloorDivideInplaceStridedFunctor, + floor_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct FloorDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (!FloorDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::floor_divide diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp new file mode 100644 index 000000000000..9b3659faa161 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -0,0 +1,317 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
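`FloorDivideFunctor` and `FloorDivideInplaceFunctor` above both build floor division out of C++'s truncating `/` plus a sign-based correction, and define integer division by zero to yield 0. A scalar reference of those semantics (standalone sketch, not from the diff):

```cpp
#include <cassert>

// C++ '/' truncates toward zero; subtract 1 when the remainder is nonzero and
// the operands disagree in sign. Division by zero returns 0 by convention.
static long long floor_div(long long a, long long b)
{
    if (b == 0) {
        return 0;
    }
    const long long div = a / b;
    const long long mod = a % b;
    const bool corr = (mod != 0) && ((mod < 0) != (b < 0));
    return div - (corr ? 1 : 0);
}

int main()
{
    assert(floor_div(7, 2) == 3);
    assert(floor_div(-7, 2) == -4); // truncation alone would give -3
    assert(floor_div(7, -2) == -4);
    assert(floor_div(-7, -2) == 3);
    assert(floor_div(5, 0) == 0);
}
```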
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::greater +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct GreaterFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::greater_complex; + return greater_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? false : (static_cast(in1) > in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? 
true + : (in1 > static_cast(in2)); + } + } + } + else { + return (in1 > in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 > in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using GreaterContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using GreaterStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + GreaterFunctor>; + +template +struct GreaterOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct GreaterContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class greater_contig_kernel; + +template +sycl::event greater_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using GreaterHS = + hyperparam_detail::GreaterContigHyperparameterSet; + + static constexpr std::uint8_t vec_sz = GreaterHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, GreaterOutputType, GreaterContigFunctor, + greater_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct GreaterContigFactory +{ + fnT get() + { + if constexpr (!GreaterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_contig_impl; + return fn; + } + } +}; + +template +struct GreaterTypeMapFactory +{ + /*! 
@brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename GreaterOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class greater_strided_kernel; + +template +sycl::event + greater_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, GreaterOutputType, GreaterStridedFunctor, + greater_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct GreaterStridedFactory +{ + fnT get() + { + if constexpr (!GreaterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::greater diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp new file mode 100644 index 000000000000..25c56d4d40a4 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -0,0 +1,317 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. 
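`GreaterFunctor` above special-cases operands of mixed signedness because the usual arithmetic conversions would wrap the negative side before comparing. A standalone sketch of that guard (names illustrative):

```cpp
#include <cassert>
#include <cstdint>

// A plain `s > u` would convert s to unsigned first, so -1 > 1u is true.
static bool safe_greater(std::int32_t s, std::uint32_t u)
{
    // a negative signed value can never exceed an unsigned one
    return (s < 0) ? false : (static_cast<std::uint32_t>(s) > u);
}

static bool safe_greater(std::uint32_t u, std::int32_t s)
{
    // any unsigned value exceeds a negative signed one
    return (s < 0) ? true : (u > static_cast<std::uint32_t>(s));
}

int main()
{
    assert(!safe_greater(std::int32_t{-1}, std::uint32_t{1}));
    assert(safe_greater(std::uint32_t{1}, std::int32_t{-1}));
    assert(!safe_greater(std::int32_t{3}, std::uint32_t{5}));
}
```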
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::greater_equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct GreaterEqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::greater_equal_complex; + return greater_equal_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? false : (static_cast(in1) >= in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? true + : (in1 >= static_cast(in2)); + } + } + } + else { + return (in1 >= in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 >= in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + GreaterEqualFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using GreaterEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + GreaterEqualFunctor>; + +template +struct GreaterEqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct GreaterEqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class greater_equal_contig_kernel; + +template +sycl::event + greater_equal_contig_impl(sycl::queue 
&exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using GreaterEqHS = + hyperparam_detail::GreaterEqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor, + greater_equal_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct GreaterEqualContigFactory +{ + fnT get() + { + if constexpr (!GreaterEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_equal_contig_impl; + return fn; + } + } +}; + +template +struct GreaterEqualTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename GreaterEqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class greater_equal_strided_kernel; + +template +sycl::event greater_equal_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, GreaterEqualOutputType, GreaterEqualStridedFunctor, + greater_equal_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct GreaterEqualStridedFactory +{ + fnT get() + { + if constexpr (!GreaterEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = greater_equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::greater_equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp new file mode 100644 index 000000000000..438a5eea3ae8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -0,0 +1,249 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
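For complex operands, `GreaterEqualFunctor` defers to `math_utils::greater_equal_complex`; assuming the usual dpctl convention, that is a lexicographic order on (real, imag). A reference version under that assumption (not from the diff):

```cpp
#include <cassert>
#include <complex>

template <typename T>
static bool greater_equal_complex_ref(const std::complex<T> &a,
                                      const std::complex<T> &b)
{
    // order by real part; imaginary part breaks ties
    return (a.real() == b.real()) ? (a.imag() >= b.imag())
                                  : (a.real() > b.real());
}

int main()
{
    using C = std::complex<float>;
    assert(greater_equal_complex_ref(C{2.f, 0.f}, C{1.f, 9.f}));
    assert(greater_equal_complex_ref(C{1.f, 2.f}, C{1.f, 2.f}));
    assert(!greater_equal_complex_ref(C{1.f, 1.f}, C{1.f, 2.f}));
}
```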
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of HYPOT(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::hypot +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct HypotFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::hypot(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto res = sycl::hypot(in1, in2); + if constexpr (std::is_same_v) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + res); + } + } +}; + +template +using HypotContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using HypotStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct HypotOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct HypotContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class hypot_contig_kernel; + +template +sycl::event hypot_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using HypotHS = + hyperparam_detail::HypotContigHyperparameterSet; + static constexpr std::uint8_t vec_sz 
= HypotHS::vec_sz; + static constexpr std::uint8_t n_vecs = HypotHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, HypotOutputType, HypotContigFunctor, + hypot_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct HypotContigFactory +{ + fnT get() + { + if constexpr (!HypotOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = hypot_contig_impl; + return fn; + } + } +}; + +template +struct HypotTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::hypot(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename HypotOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class hypot_strided_kernel; + +template +sycl::event + hypot_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, HypotOutputType, HypotStridedFunctor, + hypot_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct HypotStridedFactory +{ + fnT get() + { + if constexpr (!HypotOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = hypot_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::hypot diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp new file mode 100644 index 000000000000..667fb47efdc8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -0,0 +1,232 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
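`HypotFunctor` above forwards to `sycl::hypot` rather than computing `sqrt(x*x + y*y)` directly, which matters because the naive form overflows for large inputs. An illustration with the `std::` equivalent:

```cpp
#include <cassert>
#include <cmath>

int main()
{
    const double x = 1e200;
    const double y = 1e200;

    const double naive = std::sqrt(x * x + y * y); // x*x overflows to inf
    const double safe = std::hypot(x, y);

    assert(std::isinf(naive));
    assert(std::abs(safe - x * std::sqrt(2.0)) < 1e185); // ~sqrt(2) * 1e200
}
```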
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of IMAG(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::imag +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::is_complex_v; + +template +struct ImagFunctor +{ + + // is function constant for given argT + using is_constant = + typename std::is_same, std::false_type>; + // constant value, if constant + static constexpr resT constant_value = resT{0}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex_v) { + return std::imag(in); + } + else { + static_assert(std::is_same_v); + return constant_value; + } + } +}; + +template +using ImagContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ImagStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ImagOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ImagContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class imag_contig_kernel; + +template +sycl::event imag_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ImagHS = 
hyperparam_detail::ImagContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ImagHS::vec_sz; + static constexpr std::uint8_t n_vecs = ImagHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ImagContigFactory +{ + fnT get() + { + if constexpr (!ImagOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = imag_contig_impl; + return fn; + } + } +}; + +template +struct ImagTypeMapFactory +{ + /*! @brief get typeid for output type of std::imag(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ImagOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class imag_strided_kernel; + +template +sycl::event + imag_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ImagOutputType, ImagStridedFunctor, imag_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ImagStridedFactory +{ + fnT get() + { + if constexpr (!ImagOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = imag_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::imag diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp new file mode 100644 index 000000000000..8eb435c089d8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -0,0 +1,228 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
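ImagFunctor above sets its is_constant trait for non-complex inputs, because imag(x) is identically zero for real types; the kernel machinery can then fill the output with constant_value without reading any elements, and only std::complex inputs take the std::imag path. A plain-C++ analogue of that compile-time branch (illustrative sketch, not the patch's code):

// imag(x) == 0 for all real x, so a kernel may skip loads entirely.
#include <complex>
#include <iostream>
#include <type_traits>

template <typename T> struct is_complex : std::false_type {};
template <typename T> struct is_complex<std::complex<T>> : std::true_type {};

template <typename argT, typename resT>
struct ImagLike
{
    using is_constant = std::negation<is_complex<argT>>;
    static constexpr resT constant_value = resT{0};

    resT operator()(const argT &in) const
    {
        if constexpr (is_complex<argT>::value)
            return std::imag(in);
        else
            return constant_value; // never depends on `in`
    }
};

int main()
{
    ImagLike<std::complex<double>, double> ic;
    ImagLike<int, int> ii;
    std::cout << ic({1.0, -2.5}) << " " << ii(42) << "\n"; // -2.5 0
    static_assert(ImagLike<int, int>::is_constant::value);
}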
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISFINITE(x) +/// function that tests whether a tensor element is finite. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::isfinite +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct IsFiniteFunctor +{ + static_assert(std::is_same_v); + + /* + std::is_same::value || + std::is_integral::value + */ + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = true; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isfinite = std::isfinite(std::real(in)); + const bool imag_isfinite = std::isfinite(std::imag(in)); + return (real_isfinite && imag_isfinite); + } + else if constexpr (std::is_same::value || + std::is_integral::value) { + return constant_value; + } + else if constexpr (std::is_same_v) { + return sycl::isfinite(in); + } + else { + return std::isfinite(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::isfinite(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsFiniteContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using IsFiniteStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct IsFiniteOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct IsFiniteContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class isfinite_contig_kernel; + +template +sycl::event isfinite_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using IsFiniteHS = + 
hyperparam_detail::IsFiniteContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, IsFiniteOutputType, IsFiniteContigFunctor, + isfinite_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct IsFiniteContigFactory +{ + fnT get() + { + fnT fn = isfinite_contig_impl; + return fn; + } +}; + +template +struct IsFiniteTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::isfinite(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsFiniteOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class isfinite_strided_kernel; + +template +sycl::event + isfinite_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct IsFiniteStridedFactory +{ + fnT get() + { + fnT fn = isfinite_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::isfinite diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp new file mode 100644 index 000000000000..b7d85e21a1f2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -0,0 +1,223 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
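IsFiniteFunctor above declares a complex value finite only when both components are finite, while bool and integral inputs are finite by construction (hence the is_constant trait with constant_value = true). A small standard-C++ check of the complex rule (illustrative only):

// Finiteness of a complex value = both components finite.
#include <cmath>
#include <complex>
#include <iostream>
#include <limits>

bool isfinite_cx(const std::complex<double> &z)
{
    return std::isfinite(z.real()) && std::isfinite(z.imag());
}

int main()
{
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();
    std::cout << isfinite_cx({1.0, 2.0}) << "\n"; // 1
    std::cout << isfinite_cx({1.0, inf}) << "\n"; // 0: one infinite part suffices
    std::cout << isfinite_cx({nan, 0.0}) << "\n"; // 0: NaN is not finite
}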
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISINF(x) +/// function that tests whether a tensor element is an infinity. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::isinf +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct IsInfFunctor +{ + static_assert(std::is_same_v); + + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = false; + using supports_vec = + typename std::disjunction, + std::is_floating_point>; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isinf = std::isinf(std::real(in)); + const bool imag_isinf = std::isinf(std::imag(in)); + return (real_isinf || imag_isinf); + } + else if constexpr (std::is_same::value || + std::is_integral::value) { + return constant_value; + } + else if constexpr (std::is_same_v) { + return sycl::isinf(in); + } + else { + return std::isinf(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::isinf(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsInfContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using IsInfStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct IsInfOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct IsInfContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class isinf_contig_kernel; + +template +sycl::event isinf_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using IsInfHS = hyperparam_detail::IsInfContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = IsInfHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsInfHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct IsInfContigFactory +{ + fnT get() + { + fnT fn = isinf_contig_impl; + return fn; + } +}; + +template +struct IsInfTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::isinf(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsInfOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class isinf_strided_kernel; + +template +sycl::event + isinf_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, IsInfOutputType, IsInfStridedFunctor, isinf_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct IsInfStridedFactory +{ + fnT get() + { + fnT fn = isinf_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::isinf diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp new file mode 100644 index 000000000000..cad2d2239de0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -0,0 +1,221 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ISNAN(x) +/// function that tests whether a tensor element is a NaN. 
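IsInfFunctor above and IsNanFunctor below apply the complementary rule: a complex value is infinite (or NaN) when either component is, so a value such as (inf, NaN) reports true for both predicates. Illustrative standard-C++ check:

// Complex isinf/isnan use an "either component" disjunction.
#include <cmath>
#include <complex>
#include <iostream>
#include <limits>

bool isinf_cx(const std::complex<double> &z)
{
    return std::isinf(z.real()) || std::isinf(z.imag());
}

bool isnan_cx(const std::complex<double> &z)
{
    return std::isnan(z.real()) || std::isnan(z.imag());
}

int main()
{
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();
    std::complex<double> z{inf, nan};
    std::cout << isinf_cx(z) << " " << isnan_cx(z) << "\n"; // 1 1
    std::cout << isinf_cx({0.0, inf}) << "\n";              // 1
}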
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::isnan +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct IsNanFunctor +{ + static_assert(std::is_same_v); + + /* + std::is_same::value || + std::is_integral::value + */ + using is_constant = typename std::disjunction, + std::is_integral>; + static constexpr resT constant_value = false; + using supports_vec = typename std::true_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + const bool real_isnan = sycl::isnan(std::real(in)); + const bool imag_isnan = sycl::isnan(std::imag(in)); + return (real_isnan || imag_isnan); + } + else if constexpr (std::is_same::value || + std::is_integral::value) { + return constant_value; + } + else { + return sycl::isnan(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::isnan(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using IsNanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using IsNanStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct IsNanOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct IsNanContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class isnan_contig_kernel; + +template +sycl::event isnan_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using IsNanHS = hyperparam_detail::IsNanContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = IsNanHS::vec_sz; + static constexpr std::uint8_t n_vecs = IsNanHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct IsNanContigFactory +{ + fnT get() + { + fnT fn = isnan_contig_impl; + return fn; + } +}; + +template +struct IsNanTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::isnan(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename IsNanOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class isnan_strided_kernel; + +template +sycl::event + isnan_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, IsNanOutputType, IsNanStridedFunctor, isnan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct IsNanStridedFactory +{ + fnT get() + { + fnT fn = isnan_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::isnan diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp new file mode 100644 index 000000000000..19077936372e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -0,0 +1,314 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. 
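The LessFunctor that follows guards against the classic signed/unsigned pitfall: naively, -1 < 1u evaluates to false because -1 converts to a huge unsigned value, so the functor special-cases mixed signedness (a negative signed operand decides the result outright; otherwise it compares after a safe cast). C++20's std::cmp_less encodes the same rule, as this short standalone demo shows (illustrative only; requires C++20):

// The signed/unsigned hazard the functor guards against, next to the fix.
#include <iostream>
#include <utility>

int main()
{
    int s = -1;
    unsigned u = 1u;
    std::cout << (s < u) << "\n"; // 0: -1 wraps to 4294967295 before comparing
    std::cout << ((s < 0) ? true : (static_cast<unsigned>(s) < u)) << "\n"; // 1
    std::cout << std::cmp_less(s, u) << "\n"; // 1: value-correct comparison
}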
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::less +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LessFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::less_complex; + return less_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? true : (static_cast(in1) < in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? false + : (in1 < static_cast(in2)); + } + } + } + else { + return (in1 < in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = (in1 < in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LessContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LessStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct LessOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LessContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class less_contig_kernel; + +template +sycl::event less_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + 
using LessHS = + hyperparam_detail::LessContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LessHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct LessContigFactory +{ + fnT get() + { + if constexpr (!LessOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_contig_impl; + return fn; + } + } +}; + +template +struct LessTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename LessOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class less_strided_kernel; + +template +sycl::event + less_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LessOutputType, LessStridedFunctor, + less_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LessStridedFactory +{ + fnT get() + { + if constexpr (!LessOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::less diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp new file mode 100644 index 000000000000..a0b23693e70d --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -0,0 +1,316 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
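Throughout these files, each *ContigFactory::get() / *StridedFactory::get() returns either a concrete implementation or nullptr when the corresponding OutputType::is_defined is false; host code assembles the results into type-indexed dispatch tables and treats nullptr entries as "unsupported type combination". A minimal self-contained analogue of that pattern (all names here are hypothetical, not the patch's API):

// Unsupported type pairs yield nullptr entries that callers must check.
#include <cstdio>

using fn_t = bool (*)(const void *, const void *);

template <typename T1, typename T2>
struct OutputDefined { static constexpr bool value = true; };
template <>
struct OutputDefined<bool, float> { static constexpr bool value = false; }; // say

template <typename T1, typename T2>
bool less_impl(const void *a, const void *b)
{
    return *static_cast<const T1 *>(a) < *static_cast<const T2 *>(b);
}

template <typename fnT, typename T1, typename T2>
struct ContigFactory
{
    fnT get()
    {
        if constexpr (!OutputDefined<T1, T2>::value)
            return nullptr;
        else
            return less_impl<T1, T2>;
    }
};

int main()
{
    fn_t f = ContigFactory<fn_t, int, int>{}.get();
    fn_t g = ContigFactory<fn_t, bool, float>{}.get();
    int a = 2, b = 3;
    std::printf("%d %s\n", f(&a, &b), g ? "supported" : "unsupported"); // 1 unsupported
}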
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of comparison of +/// tensor elements. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::less_equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LessEqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::less_equal_complex; + return less_equal_complex(in1, in2); + } + else { + if constexpr (std::is_integral_v && + std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && + !std::is_signed_v) { + return (in1 < 0) ? true : (static_cast(in1) <= in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? 
false + : (in1 <= static_cast(in2)); + } + } + } + else { + return (in1 <= in2); + } + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 <= in2); + + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LessEqualFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LessEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LessEqualFunctor>; + +template +struct LessEqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LessEqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class less_equal_contig_kernel; + +template +sycl::event less_equal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LessEqHS = + hyperparam_detail::LessEqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LessEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = LessEqHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor, + less_equal_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct LessEqualContigFactory +{ + fnT get() + { + if constexpr (!LessEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_equal_contig_impl; + return fn; + } + } +}; + +template +struct LessEqualTypeMapFactory +{ + /*! 
@brief get typeid for output type of operator()>(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename LessEqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class less_equal_strided_kernel; + +template +sycl::event + less_equal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LessEqualOutputType, LessEqualStridedFunctor, + less_equal_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LessEqualStridedFactory +{ + fnT get() + { + if constexpr (!LessEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = less_equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::less_equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp new file mode 100644 index 000000000000..05e5048f65a7 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -0,0 +1,222 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG(x) function. 
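For complex input the LogFunctor below computes the principal logarithm via the SYCL complex extension (exprm_ns::log); std::log on std::complex has the same principal-branch semantics, log z = ln|z| + i*atan2(im z, re z), as this quick standard-C++ check shows (illustrative only):

// Principal-branch complex log: log z = ln|z| + i*atan2(im, re).
#include <cmath>
#include <complex>
#include <iostream>

int main()
{
    std::complex<double> z{-1.0, 0.0};
    std::complex<double> manual{std::log(std::abs(z)),
                                std::atan2(z.imag(), z.real())};
    std::cout << std::log(z) << " " << manual << "\n"; // both ~(0,3.14159)
}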
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct LogFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + return exprm_ns::log(exprm_ns::complex(in)); // log(in); + } + else { + return sycl::log(in); + } + } +}; + +template +using LogContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct LogOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct LogContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log_contig_kernel; + +template +sycl::event log_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using LogHS = hyperparam_detail::LogContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct LogContigFactory +{ + fnT get() + { + if constexpr (!LogOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log_contig_impl; + return fn; + } + } +}; + +template +struct LogTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::log(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log_strided_kernel; + +template +sycl::event log_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, LogOutputType, LogStridedFunctor, log_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LogStridedFactory +{ + fnT get() + { + if constexpr (!LogOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp new file mode 100644 index 000000000000..8ddb701ea622 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -0,0 +1,240 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG10(x) function. 
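The complex branch of the Log10Functor below relies on the change-of-base identity log10 z = log z / ln 10 (computed with exprm_ns::log in the kernel). The same identity with std::complex, checked against the library's own log10 (illustrative sketch):

// Change-of-base identity used for complex log10.
#include <cmath>
#include <complex>
#include <iostream>

int main()
{
    std::complex<double> z{3.0, 4.0};
    std::cout << std::log(z) / std::log(10.0) << "\n"; // equals...
    std::cout << std::log10(z) << "\n";                // ...the library log10
}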
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log10 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct Log10Functor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + // return (log(in) / log(realT{10})); + return exprm_ns::log(exprm_ns::complex(in)) / + sycl::log(realT{10}); + } + else { + return sycl::log10(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::log10(in); + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using Log10ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Log10StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Log10OutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Log10ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log10_contig_kernel; + +template +sycl::event log10_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Log10HS = hyperparam_detail::Log10ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Log10HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log10HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Log10ContigFactory +{ + fnT get() + { + if constexpr (!Log10OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log10_contig_impl; + return fn; + } + } +}; + +template +struct Log10TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::log10(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Log10OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log10_strided_kernel; + +template +sycl::event + log10_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Log10OutputType, Log10StridedFunctor, log10_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Log10StridedFactory +{ + fnT get() + { + if constexpr (!Log10OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log10_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log10 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp new file mode 100644 index 000000000000..8365932aead7 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -0,0 +1,248 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG1P(x) function. 
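log1p exists because log(1 + x) loses every significant digit when x is tiny: 1 + x rounds to exactly 1 in floating point. That is also why the complex branch of the Log1pFunctor below reformulates through sycl::log1p(x*(2 + x) + y*y) / 2 near zero instead of taking log of a hypot. A two-line standard-C++ demonstration of the real case (illustrative only):

// For tiny x, 1.0 + x rounds to 1.0 and log(1.0 + x) returns 0,
// while log1p(x) keeps the leading-order answer (~x).
#include <cmath>
#include <cstdio>

int main()
{
    double x = 1e-18;
    std::printf("%.3e  %.3e\n", std::log(1.0 + x), std::log1p(x));
    // prints: 0.000e+00  1.000e-18
}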
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log1p +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +// TODO: evaluate precision against alternatives +template +struct Log1pFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + // log1p(z) = ln((x + 1) + yI) + // = ln(|(x + 1) + yi|) + // + I * atan2(y, x + 1) + // = ln(sqrt((x + 1)^2 + y^2)) + // + I *atan2(y, x + 1) + // = log1p(x^2 + 2x + y^2) / 2 + // + I * atan2(y, x + 1) + using realT = typename argT::value_type; + const realT x = std::real(in); + const realT y = std::imag(in); + + // imaginary part of result + const realT res_im = sycl::atan2(y, x + 1); + + if (std::max(sycl::fabs(x), sycl::fabs(y)) < realT{.1}) { + const realT v = x * (2 + x) + y * y; + return resT{sycl::log1p(v) / 2, res_im}; + } + else { + // when not close to zero, + // prevent overflow + const realT m = sycl::hypot(x + 1, y); + return resT{sycl::log(m), res_im}; + } + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::log1p(in); + } + } +}; + +template +using Log1pContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Log1pStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Log1pOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Log1pContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log1p_contig_kernel; + +template +sycl::event log1p_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Log1pHS = hyperparam_detail::Log1pContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Log1pHS::vec_sz; + static constexpr std::uint8_t n_vecs = Log1pHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct 
Log1pContigFactory +{ + fnT get() + { + if constexpr (!Log1pOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log1p_contig_impl; + return fn; + } + } +}; + +template +struct Log1pTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::log1p(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Log1pOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log1p_strided_kernel; + +template +sycl::event + log1p_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Log1pOutputType, Log1pStridedFunctor, log1p_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Log1pStridedFactory +{ + fnT get() + { + if constexpr (!Log1pOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log1p_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log1p diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp new file mode 100644 index 000000000000..3cb537b82522 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -0,0 +1,241 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOG2(x) function. 
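The complex branch of Log1pFunctor above switches algorithms on magnitude: near the origin it evaluates log1p(x*(2 + x) + y*y) / 2, which is exact because (x + 1)^2 + y^2 = 1 + (x^2 + 2x + y^2), and it avoids the cancellation that the direct log(hypot(x + 1, y)) form suffers for tiny |z|. That identity can be checked on the host with the C++ standard library alone; the helper names below are mine, not from the patch.

#include <cmath>
#include <complex>
#include <cstdio>

// Real part of log1p(z) two ways, for z = x + yi.
static double naive_re(double x, double y)
{
    return std::log(std::hypot(x + 1.0, y)); // loses digits for tiny |z|
}

static double stable_re(double x, double y)
{
    return 0.5 * std::log1p(x * (2.0 + x) + y * y); // functor's small branch
}

int main()
{
    const double x = 1e-12, y = 1e-12;
    // Leading behaviour: Re log1p(z) ~ x + (y^2 - x^2)/2, i.e. ~1e-12 here;
    // the naive form is only accurate to ~4 digits at this magnitude.
    std::printf("naive : %.17e\n", naive_re(x, y));
    std::printf("stable: %.17e\n", stable_re(x, y));
    // The imaginary part is shared by both branches of the functor:
    std::printf("imag  : %.17e\n", std::atan2(y, x + 1.0));
}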
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::log2 +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct Log2Functor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + // log(in) / log(realT{2}); + return exprm_ns::log(exprm_ns::complex(in)) / + sycl::log(realT{2}); + } + else { + return sycl::log2(in); + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::log2(in); + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using Log2ContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using Log2StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct Log2OutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct Log2ContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class log2_contig_kernel; + +template +sycl::event log2_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using Log2HS = hyperparam_detail::Log2ContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = Log2HS::vec_sz; + static constexpr std::uint8_t n_vecs = Log2HS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct Log2ContigFactory +{ + fnT get() + { + if constexpr (!Log2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log2_contig_impl; + return fn; + } + } +}; + +template +struct Log2TypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::log2(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Log2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class log2_strided_kernel; + +template +sycl::event + log2_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Log2OutputType, Log2StridedFunctor, log2_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct Log2StridedFactory +{ + fnT get() + { + if constexpr (!Log2OutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = log2_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::log2 diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp new file mode 100644 index 000000000000..3a79950672d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -0,0 +1,263 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// \file +/// This file defines kernels for elementwise evaluation of LOGADDEXP(x1, x2) +/// function. 
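Log2Functor above needs no dedicated complex algorithm: it computes log(z) divided by ln 2 through the experimental SYCL complex wrapper. For validating outputs, the host-side equivalent over std::complex is a one-liner, sketched here with illustrative values.

#include <cmath>
#include <complex>
#include <cstdio>

// log2(z) = log(z) / ln(2); verify by raising 2 back to the result.
int main()
{
    const std::complex<double> z{-3.0, 4.0};
    const std::complex<double> w = std::log(z) / std::log(2.0);
    const std::complex<double> back = std::pow(std::complex<double>{2.0}, w);
    std::printf("w   = %g %+gi\n", w.real(), w.imag());
    std::printf("2^w = %g %+gi (expect -3 +4i)\n", back.real(), back.imag());
}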
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "common.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/math_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels::logaddexp
+{
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template
+struct LogAddExpFunctor
+{
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using dpctl::tensor::math_utils::logaddexp;
+        return logaddexp(in1, in2);
+    }
+
+    template
+    sycl::vec
+    operator()(const sycl::vec &in1,
+               const sycl::vec &in2) const
+    {
+        sycl::vec res;
+        auto diff = in1 - in2; // take advantage of faster vec arithmetic
+
+#pragma unroll
+        for (int i = 0; i < vec_sz; ++i) {
+            if (std::isfinite(diff[i])) {
+                res[i] = std::max(in1[i], in2[i]) +
+                         impl_finite(-sycl::fabs(diff[i]));
+            }
+            else {
+                using dpctl::tensor::math_utils::logaddexp;
+                res[i] = logaddexp(in1[i], in2[i]);
+            }
+        }
+
+        return res;
+    }
+
+private:
+    template
+    T impl_finite(T const &in) const
+    {
+        return (in > 0) ? (in + sycl::log1p(sycl::exp(-in)))
+                        : sycl::log1p(sycl::exp(in));
+    }
+};
+
+template
+using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogAddExpFunctor,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template
+using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogAddExpFunctor>;
+
+template
+struct LogAddExpOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry,
+        td_ns::BinaryTypeMapResultEntry,
+        td_ns::BinaryTypeMapResultEntry,
+        td_ns::DefaultResultEntry>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template
+struct LogAddExpContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template
+class logaddexp_contig_kernel;
+
+template
+sycl::event logaddexp_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector &depends = {})
+{
+    using LogAddExpHS =
+        hyperparam_detail::LogAddExpContigHyperparameterSet;
+    static constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor,
+        logaddexp_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template
+struct LogAddExpContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogAddExpOutputType::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logaddexp_contig_impl;
+            return fn;
+        }
+    }
+};
+
+template
+struct LogAddExpTypeMapFactory
+{
+    /*!
@brief get typeid for output type of logaddexp(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogAddExpOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logaddexp_strided_kernel; + +template +sycl::event + logaddexp_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogAddExpOutputType, LogAddExpStridedFunctor, + logaddexp_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LogAddExpStridedFactory +{ + fnT get() + { + if constexpr (!LogAddExpOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logaddexp_strided_impl; + return fn; + } + } +}; + +template +class logaddexp_matrix_row_broadcast_sg_krn; + +} // namespace dpctl::tensor::kernels::logaddexp diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp new file mode 100644 index 000000000000..39049dab8d5e --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -0,0 +1,291 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_AND(x1, x2) +/// function. 
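The sycl::vec overload of LogAddExpFunctor above rewrites logaddexp(a, b) as max(a, b) + log1p(exp(-|a - b|)) whenever the difference is finite, the standard overflow-safe form, and falls back to the scalar helper for inf/NaN pairs. A scalar host-side comparison against the naive formula shows the motivation; naive and stable are my names, not the patch's.

#include <cmath>
#include <cstdio>

static double naive(double a, double b)
{
    return std::log(std::exp(a) + std::exp(b)); // exp overflows past ~709
}

static double stable(double a, double b)
{
    const double d = std::fabs(a - b);
    if (std::isfinite(d)) {
        return std::fmax(a, b) + std::log1p(std::exp(-d));
    }
    return naive(a, b); // inf/NaN inputs resolve correctly this way
}

int main()
{
    std::printf("naive (1000, 1000): %g\n", naive(1000.0, 1000.0));  // inf
    std::printf("stable(1000, 1000): %g\n", stable(1000.0, 1000.0)); // 1000.69...
}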
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::logical_and +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalAndFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + using tu_ns::convert_impl; + + return (convert_impl(in1) && + convert_impl(in2)); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 && in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogicalAndFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalAndStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogicalAndFunctor>; + +template +struct LogicalAndOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LogicalAndContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_and_contig_kernel; + +template +sycl::event + logical_and_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LogicalAndHS = + hyperparam_detail::LogicalAndContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor, + logical_and_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + 
res_offset, depends); +} + +template +struct LogicalAndContigFactory +{ + fnT get() + { + if constexpr (!LogicalAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_and_contig_impl; + return fn; + } + } +}; + +template +struct LogicalAndTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalAndOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_and_strided_kernel; + +template +sycl::event + logical_and_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogicalAndOutputType, LogicalAndStridedFunctor, + logical_and_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LogicalAndStridedFactory +{ + fnT get() + { + if constexpr (!LogicalAndOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_and_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::logical_and diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp new file mode 100644 index 000000000000..b8f1c042ca73 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -0,0 +1,199 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
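As LogicalAndFunctor above shows, LOGICAL_AND converts each operand to bool first, so the semantics are NumPy-style truthiness, with a complex value counting as true when either component is nonzero. A standalone sketch of that conversion rule follows; truthy and the sample values are mine. Note also that, unlike scalar &&, the elementwise kernel always evaluates both inputs, so there is no short-circuiting across an array.

#include <complex>
#include <cstdio>

// Host-side mirror of the bool conversion applied by the functor.
template <typename T>
bool truthy(const T &v)
{
    return static_cast<bool>(v);
}

template <typename T>
bool truthy(const std::complex<T> &v)
{
    return v.real() != T(0) || v.imag() != T(0);
}

int main()
{
    const std::complex<double> z0{0.0, 0.0}, zi{0.0, 2.5};
    std::printf("and(3, 0)      -> %d\n", truthy(3) && truthy(0));  // 0
    std::printf("and(3, 0+2.5i) -> %d\n", truthy(3) && truthy(zi)); // 1
    std::printf("and(3, 0+0i)   -> %d\n", truthy(3) && truthy(z0)); // 0
}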
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_NOT(x) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::logical_not +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalNotFunctor +{ + static_assert(std::is_same_v); + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT &in) const + { + using tu_ns::convert_impl; + return !convert_impl(in); + } +}; + +template +using LogicalNotContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalNotStridedFunctor = + elementwise_common::UnaryStridedFunctor>; + +template +struct LogicalNotOutputType +{ + using value_type = bool; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct LogicalNotContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_not_contig_kernel; + +template +sycl::event + logical_not_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using LogicalNotHS = + hyperparam_detail::LogicalNotContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, LogicalNotOutputType, LogicalNotContigFunctor, + logical_not_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct LogicalNotContigFactory +{ + fnT get() + { + fnT fn = logical_not_contig_impl; + return fn; + } +}; + +template +struct LogicalNotTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::logical_not(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalNotOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_not_strided_kernel; + +template +sycl::event + logical_not_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct LogicalNotStridedFactory +{ + fnT get() + { + fnT fn = logical_not_strided_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::logical_not diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp new file mode 100644 index 000000000000..637e7681e7c0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -0,0 +1,290 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_OR(x1, x2) +/// function. 
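LogicalNotOutputType above is unconditionally bool, which is why LogicalNotContigFactory and LogicalNotStridedFactory carry no nullptr branch: the operation is defined for every input type, in contrast to, say, logaddexp. Its negated-truthiness rule reduces to the following host-side behaviour (illustrative code, not from the patch).

#include <complex>
#include <cstdio>

template <typename T>
bool logical_not_of(const T &v)
{
    return !static_cast<bool>(v);
}

template <typename T>
bool logical_not_of(const std::complex<T> &v)
{
    // true only when both components are zero
    return v.real() == T(0) && v.imag() == T(0);
}

int main()
{
    std::printf("%d\n", logical_not_of(0));                          // 1
    std::printf("%d\n", logical_not_of(7));                          // 0
    std::printf("%d\n", logical_not_of(std::complex<double>{0, 1})); // 0
}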
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::logical_or +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalOrFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + using tu_ns::convert_impl; + + return (convert_impl(in1) || + convert_impl(in2)); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + + auto tmp = (in1 || in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogicalOrFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalOrStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogicalOrFunctor>; + +template +struct LogicalOrOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LogicalOrContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_or_contig_kernel; + +template +sycl::event logical_or_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LogicalOrHS = + hyperparam_detail::LogicalOrContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor, + logical_or_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, 
depends); +} + +template +struct LogicalOrContigFactory +{ + fnT get() + { + if constexpr (!LogicalOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_or_contig_impl; + return fn; + } + } +}; + +template +struct LogicalOrTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalOrOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_or_strided_kernel; + +template +sycl::event + logical_or_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogicalOrOutputType, LogicalOrStridedFunctor, + logical_or_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LogicalOrStridedFactory +{ + fnT get() + { + if constexpr (!LogicalOrOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_or_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::logical_or diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp new file mode 100644 index 000000000000..698e4d9ab5c1 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -0,0 +1,292 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
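A detail worth flagging in the vec overloads of logical_or above and logical_xor below: applying || (or a comparison) to sycl::vec does not yield sycl::vec<bool, N>. Per the SYCL specification it produces a vector of same-sized signed integers with -1 encoding true, which is why the functors pass the intermediate through vec_cast when resT differs from the deduced element type. Assuming a SYCL 2020 toolchain, the effect is visible even in host code; the variable names below are mine.

#include <cstdio>
#include <sycl/sycl.hpp>

int main()
{
    const sycl::vec<float, 4> a{0.f, 1.f, 0.f, 2.f};
    const sycl::vec<float, 4> b{0.f, 0.f, 3.f, 4.f};

    auto t = (a || b); // element type is a signed integer, not bool
    for (int i = 0; i < 4; ++i) {
        // prints -1 (true) or 0 (false) per SYCL's vec logical-op convention
        std::printf("%d ", static_cast<int>(t[i]));
    }
    std::printf("\n"); // expected: 0 -1 -1 -1
}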
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGICAL_XOR(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::logical_xor +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct LogicalXorFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + using tu_ns::convert_impl; + + return (convert_impl(in1) != + convert_impl(in2)); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + using tu_ns::vec_cast; + auto tmp1 = vec_cast(in1); + auto tmp2 = vec_cast(in2); + + auto tmp = (tmp1 != tmp2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + return vec_cast( + tmp); + } + } +}; + +template +using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogicalXorFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using LogicalXorStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogicalXorFunctor>; + +template +struct LogicalXorOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct LogicalXorContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class logical_xor_contig_kernel; + +template +sycl::event + logical_xor_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using LogicalXorHS = + hyperparam_detail::LogicalXorContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = 
LogicalXorHS::vec_sz; + static constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor, + logical_xor_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct LogicalXorContigFactory +{ + fnT get() + { + if constexpr (!LogicalXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_xor_contig_impl; + return fn; + } + } +}; + +template +struct LogicalXorTypeMapFactory +{ + /*! @brief get typeid for output type of operator()>(x, y), always bool + */ + std::enable_if_t::value, int> get() + { + using rT = typename LogicalXorOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logical_xor_strided_kernel; + +template +sycl::event + logical_xor_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogicalXorOutputType, LogicalXorStridedFunctor, + logical_xor_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct LogicalXorStridedFactory +{ + fnT get() + { + if constexpr (!LogicalXorOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logical_xor_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::logical_xor diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp new file mode 100644 index 000000000000..52494cceba93 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -0,0 +1,321 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MAXIMUM(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "common.hpp" +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::maximum +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MaximumFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::max_complex; + return max_complex(in1, in2); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = (sycl::isnan(in1) || (in1 > in2)); + return (choose_first) ? in1 : in2; + } + else { + return (in1 > in2) ? in1 : in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + const auto &v1 = in1[i]; + const auto &v2 = in2[i]; + if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = (sycl::isnan(v1) || (v1 > v2)); + res[i] = (choose_first) ? v1 : v2; + } + else { + res[i] = (v1 > v2) ? 
v1 : v2; + } + } + return res; + } +}; + +template +using MaximumContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MaximumStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MaximumFunctor>; + +template +struct MaximumOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct MaximumContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class maximum_contig_kernel; + +template +sycl::event maximum_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MaxHS = + hyperparam_detail::MaximumContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MaxHS::vec_sz; + static constexpr std::uint8_t n_vecs = MaxHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MaximumOutputType, MaximumContigFunctor, + maximum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct MaximumContigFactory +{ + fnT get() + { + if constexpr (!MaximumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = maximum_contig_impl; + return fn; + } + } +}; + +template +struct MaximumTypeMapFactory +{ + /*! 
@brief get typeid for output type of maximum(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MaximumOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class maximum_strided_kernel; + +template +sycl::event + maximum_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MaximumOutputType, MaximumStridedFunctor, + maximum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MaximumStridedFactory +{ + fnT get() + { + if constexpr (!MaximumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = maximum_strided_impl; + return fn; + } + } +}; +} // namespace dpctl::tensor::kernels::maximum diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp new file mode 100644 index 000000000000..c11961f8c5c0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -0,0 +1,321 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MINIMUM(x1, x2) +/// function. 
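Both the scalar and vec paths of MaximumFunctor above choose in1 whenever it is NaN or strictly greater, so NaNs propagate regardless of operand order, matching numpy.maximum semantics; MinimumFunctor below mirrors the rule with the < comparison. Contrast this with std::max, whose NaN result flips with argument order. The helper below is mine, restating the functor's scalar rule on the host.

#include <algorithm>
#include <cmath>
#include <cstdio>

static double nan_propagating_max(double a, double b)
{
    return (std::isnan(a) || a > b) ? a : b; // same rule as the functor
}

int main()
{
    const double nan = std::nan("");
    // std::max returns its first argument when (a < b) is false,
    // so a NaN survives only in one order:
    std::printf("std::max(nan, 1.0) = %g\n", std::max(nan, 1.0)); // nan
    std::printf("std::max(1.0, nan) = %g\n", std::max(1.0, nan)); // 1
    // The kernel's rule propagates NaN from either side:
    std::printf("maximum(nan, 1.0)  = %g\n", nan_propagating_max(nan, 1.0));
    std::printf("maximum(1.0, nan)  = %g\n", nan_propagating_max(1.0, nan));
}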
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "common.hpp" +#include "vec_size_util.hpp" + +#include "utils/math_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels::minimum +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MinimumFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value || + tu_ns::is_complex::value) { + static_assert(std::is_same_v); + using dpctl::tensor::math_utils::min_complex; + return min_complex(in1, in2); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = sycl::isnan(in1) || (in1 < in2); + return (choose_first) ? in1 : in2; + } + else { + return (in1 < in2) ? in1 : in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + const auto &v1 = in1[i]; + const auto &v2 = in2[i]; + if constexpr (std::is_floating_point_v || + std::is_same_v) { + const bool choose_first = sycl::isnan(v1) || (v1 < v2); + res[i] = (choose_first) ? v1 : v2; + } + else { + res[i] = (v1 < v2) ? v1 : v2; + } + } + return res; + } +}; + +template +using MinimumContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MinimumStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MinimumFunctor>; + +template +struct MinimumOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct MinimumContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class minimum_contig_kernel; + +template +sycl::event minimum_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MinHS = + 
hyperparam_detail::MinimumContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MinHS::vec_sz; + static constexpr std::uint8_t n_vecs = MinHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MinimumOutputType, MinimumContigFunctor, + minimum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p, + arg1_offset, arg2_p, arg2_offset, + res_p, res_offset, depends); +} + +template +struct MinimumContigFactory +{ + fnT get() + { + if constexpr (!MinimumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = minimum_contig_impl; + return fn; + } + } +}; + +template +struct MinimumTypeMapFactory +{ + /*! @brief get typeid for output type of minimum(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MinimumOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class minimum_strided_kernel; + +template +sycl::event + minimum_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MinimumOutputType, MinimumStridedFunctor, + minimum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MinimumStridedFactory +{ + fnT get() + { + if constexpr (!MinimumOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = minimum_strided_impl; + return fn; + } + } +}; +} // namespace dpctl::tensor::kernels::minimum diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp new file mode 100644 index 000000000000..58ff88b3afeb --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -0,0 +1,641 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of MUL(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::multiply +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct MultiplyFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) * + exprm_ns::complex(in2); + } + else { + return in1 * in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 * in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using MultiplyContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + MultiplyFunctor>; + +template +struct MultiplyOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct 
MultiplyContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class multiply_contig_kernel; + +template +sycl::event multiply_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MulHS = + hyperparam_detail::MultiplyContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor, + multiply_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct MultiplyContigFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_contig_impl; + return fn; + } + } +}; + +template +struct MultiplyTypeMapFactory +{ + /*! @brief get typeid for output type of multiply(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename MultiplyOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class multiply_strided_kernel; + +template +sycl::event + multiply_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, MultiplyOutputType, MultiplyStridedFunctor, + multiply_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct MultiplyStridedFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_strided_impl; + return fn; + } + } +}; + +template +class multiply_matrix_row_broadcast_sg_krn; + +template +using MultiplyContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + MultiplyFunctor>; + +template +sycl::event multiply_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] * vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, MultiplyContigMatrixContigRowBroadcastingFunctor, + multiply_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct MultiplyContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename MultiplyOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event multiply_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] * vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return multiply_contig_matrix_contig_row_broadcast_impl( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +}; + +template +struct MultiplyContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!MultiplyOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename MultiplyOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + multiply_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct MultiplyInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) { res *= in; } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res *= in; + } +}; + +template +using MultiplyInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + MultiplyInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using MultiplyInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + MultiplyInplaceFunctor>; + +template +class multiply_inplace_contig_kernel; + +/* @brief Types supported by in-place multiplication */ +template +struct MultiplyInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + 
std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MultiplyInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x *= y */ + std::enable_if_t::value, int> get() + { + if constexpr (MultiplyInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + multiply_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using MulHS = + hyperparam_detail::MultiplyContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = MulHS::vec_sz; + static constexpr std::uint8_t n_vecs = MulHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, MultiplyInplaceContigFunctor, + multiply_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct MultiplyInplaceContigFactory +{ + fnT get() + { + if constexpr (!MultiplyInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_inplace_contig_impl; + return fn; + } + } +}; + +template +class multiply_inplace_strided_kernel; + +template +sycl::event multiply_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, MultiplyInplaceStridedFunctor, + multiply_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct MultiplyInplaceStridedFactory +{ + fnT get() + { + if constexpr (!MultiplyInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_inplace_strided_impl; + return fn; + } + } +}; + +template +class multiply_inplace_row_matrix_broadcast_sg_krn; + +template +using MultiplyInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + MultiplyInplaceFunctor>; + +template +sycl::event multiply_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, MultiplyInplaceRowMatrixBroadcastingFunctor, + multiply_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct MultiplyInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!MultiplyInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = multiply_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace dpctl::tensor::kernels::multiply diff --git 
a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp new file mode 100644 index 000000000000..e0ac856a3818 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -0,0 +1,219 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of NEGATIVE(x) +/// function that returns -x. 
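+///
+/// The scalar rule is plain unary minus on the input value; a minimal
+/// host-side sketch of what the functor computes per element (illustrative
+/// only, not exercised by the kernel plumbing):
+///
+///     float x = 2.5f;
+///     float y = -x; // NEGATIVE(2.5f) == -2.5f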
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::negative +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct NegativeFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::false_type; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) const { return -x; } +}; + +template +using NegativeContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct NegativeOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct NegativeContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class negative_contig_kernel; + +template +sycl::event negative_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using NegHS = hyperparam_detail::NegativeContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = NegHS::vec_sz; + static constexpr std::uint8_t n_vecs = NegHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, NegativeOutputType, NegativeContigFunctor, + negative_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct NegativeContigFactory +{ + fnT get() + { + if constexpr (!NegativeOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = negative_contig_impl; + return fn; + } + } +}; + +template +struct NegativeTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::negative(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename NegativeOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using NegativeStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class negative_strided_kernel; + +template +sycl::event + negative_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct NegativeStridedFactory +{ + fnT get() + { + if constexpr (!NegativeOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = negative_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::negative diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp new file mode 100644 index 000000000000..a703892a7606 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp @@ -0,0 +1,248 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of NEXTAFTER(x1, x2) +/// function. 
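+///
+/// The functor forwards to sycl::nextafter, which follows std::nextafter
+/// semantics; a host-side sketch of the scalar rule (illustrative only):
+///
+///     float up = std::nextafter(1.0f, 2.0f); // smallest float above 1.0f
+///     float dn = std::nextafter(1.0f, 0.0f); // largest float below 1.0f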
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::nextafter +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct NextafterFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::nextafter(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto res = sycl::nextafter(in1, in2); + if constexpr (std::is_same_v) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + res); + } + } +}; + +template +using NextafterContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + NextafterFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using NextafterStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + NextafterFunctor>; + +template +struct NextafterOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct NextafterContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class nextafter_contig_kernel; + +template +sycl::event nextafter_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using NextafterHS = + hyperparam_detail::NextafterContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = NextafterHS::vec_sz; + static constexpr std::uint8_t n_vecs = NextafterHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, NextafterOutputType, NextafterContigFunctor, + nextafter_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct NextafterContigFactory +{ + fnT get() + { + if constexpr (!NextafterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = nextafter_contig_impl; + return fn; + } + } +}; + +template +struct NextafterTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::nextafter(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename NextafterOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class nextafter_strided_kernel; + +template +sycl::event + nextafter_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, NextafterOutputType, NextafterStridedFunctor, + nextafter_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct NextafterStridedFactory +{ + fnT get() + { + if constexpr (!NextafterOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = nextafter_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::nextafter diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp new file mode 100644 index 000000000000..007f374b6386 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -0,0 +1,303 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of inequality of +/// tensor elements. 
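+///
+/// For integer operands of mixed signedness the functor compares values
+/// rather than bit patterns, unlike a bare != after the usual arithmetic
+/// conversions; a host-side sketch (illustrative only, assumes 32-bit int):
+///
+///     int a = -1;
+///     unsigned int b = 4294967295u;
+///     bool plain = (a != b); // false: -1 converts to 4294967295u
+///     // the functor reports true instead, since a < 0 can never equal b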
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::kernels::not_equal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct NotEqualFunctor +{ + static_assert(std::is_same_v); + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::conjunction< + std::is_same, + std::negation, + tu_ns::is_complex>>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v && std::is_integral_v && + std::is_signed_v != std::is_signed_v) { + if constexpr (std::is_signed_v && !std::is_signed_v) { + return (in1 < 0) ? true : (static_cast(in1) != in2); + } + else { + if constexpr (!std::is_signed_v && + std::is_signed_v) { + return (in2 < 0) ? true : (in1 != static_cast(in2)); + } + } + } + else { + return (in1 != in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = (in1 != in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using NotEqualContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using NotEqualStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + NotEqualFunctor>; + +template +struct NotEqualOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns:: + BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + bool>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct NotEqualContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class not_equal_contig_kernel; + +template +sycl::event not_equal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using NotEqHS = + hyperparam_detail::NotEqualContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = 
NotEqHS::vec_sz; + static constexpr std::uint8_t n_vecs = NotEqHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor, + not_equal_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct NotEqualContigFactory +{ + fnT get() + { + if constexpr (!NotEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = not_equal_contig_impl; + return fn; + } + } +}; + +template +struct NotEqualTypeMapFactory +{ + /*! @brief get typeid for output type of operator()!=(x, y), always bool */ + std::enable_if_t::value, int> get() + { + using rT = typename NotEqualOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class not_equal_strided_kernel; + +template +sycl::event + not_equal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, NotEqualOutputType, NotEqualStridedFunctor, + not_equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct NotEqualStridedFactory +{ + fnT get() + { + if constexpr (!NotEqualOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = not_equal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::not_equal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp new file mode 100644 index 000000000000..fb351b6e50d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -0,0 +1,235 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of POSITIVE(x) +/// function that returns +x. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::positive +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct PositiveFunctor +{ + + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &x) const { return x; } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = in; + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using PositiveContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct PositiveOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct PositiveContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class positive_contig_kernel; + +template +sycl::event positive_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using PosHS = hyperparam_detail::PositiveContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = 
PosHS::vec_sz; + static constexpr std::uint8_t n_vecs = PosHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, PositiveOutputType, PositiveContigFunctor, + positive_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct PositiveContigFactory +{ + fnT get() + { + if constexpr (!PositiveOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = positive_contig_impl; + return fn; + } + } +}; + +template +struct PositiveTypeMapFactory +{ + /*! @brief get typeid for output type of std::positive(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename PositiveOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using PositiveStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class positive_strided_kernel; + +template +sycl::event + positive_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct PositiveStridedFactory +{ + fnT get() + { + if constexpr (!PositiveOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = positive_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::positive diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp new file mode 100644 index 000000000000..1c669ec894d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -0,0 +1,599 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of POW(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::pow +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct PowFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = in1; + auto tmp2 = in2; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + return resT(0); + } + } + resT res = 1; + if (tmp1 == 1 || tmp2 == 0) { + return res; + } + while (tmp2 > 0) { + if (tmp2 & 1) { + res *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + return res; + } + else if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::pow(exprm_ns::complex(in1), + exprm_ns::complex(in2)); + } + else { + return sycl::pow(in1, in2); + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = in1[i]; + auto tmp2 = in2[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; yield 0 + res[i] = 0; + continue; + } + } + resT res_tmp = 1; + if (tmp1 == 1 || tmp2 == 0) { + res[i] = res_tmp; + continue; + } + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } + return res; + } + else { + auto res = sycl::pow(in1, in2); + if constexpr (std::is_same_v< + resT, typename decltype(res)::element_type>) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast(res); + } + } + } +}; + +template +using PowContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using PowStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template +struct PowOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + 
td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct PowContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class pow_contig_kernel; + +template +sycl::event pow_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using PowHS = hyperparam_detail::PowContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template +struct PowContigFactory +{ + fnT get() + { + if constexpr (!PowOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_contig_impl; + return fn; + } + } +}; + +template +struct PowTypeMapFactory +{ + /*! @brief get typeid for output type of std::pow(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename PowOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class pow_strided_kernel; + +template +sycl::event pow_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, PowOutputType, PowStridedFunctor, pow_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct PowStridedFactory +{ + fnT get() + { + if constexpr (!PowOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_strided_impl; + return fn; + } + } +}; + +template +struct PowInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = res; + auto tmp2 = in; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res = 0; + return; + } + } + if (tmp1 == 1) { + return; + } + if (tmp2 == 0) { + res = 1; + return; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res = res_tmp; + } + else if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using r_resT = typename resT::value_type; + using r_argT = typename argT::value_type; + + res = exprm_ns::pow(exprm_ns::complex(res), + exprm_ns::complex(in)); + } + else { + res = sycl::pow(res, in); + } + return; + } + + template + void operator()(sycl::vec &res, + 
const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = res[i]; + auto tmp2 = in[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res[i] = 0; + continue; + } + } + if (tmp1 == 1) { + continue; + } + if (tmp2 == 0) { + res[i] = 1; + continue; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } + } + else { + res = sycl::pow(res, in); + } + } +}; + +template +using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + PowInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using PowInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + PowInplaceFunctor>; + +template +class pow_inplace_contig_kernel; + +/* @brief Types supported by in-place pow */ +template +struct PowInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct PowInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x **= y */ + std::enable_if_t::value, int> get() + { + if constexpr (PowInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + pow_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using PowHS = hyperparam_detail::PowContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = PowHS::vec_sz; + static constexpr std::uint8_t n_vecs = PowHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset, + depends); +} + +template +struct PowInplaceContigFactory +{ + fnT get() + { + if constexpr (!PowInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_contig_impl; + return fn; + } + } +}; + +template +class pow_inplace_strided_kernel; + +template +sycl::event + pow_inplace_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct PowInplaceStridedFactory +{ + fnT get() + { + if constexpr (!PowInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = 
pow_inplace_strided_impl<argTy, resTy>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::pow
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
new file mode 100644
index 000000000000..039da657cfd2
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -0,0 +1,239 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of PROJ(x) function.
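+///
+/// This matches std::proj semantics: any input whose real or imaginary
+/// component is infinite maps to the point at infinity, keeping only the
+/// sign of the imaginary part. A sketch of the scalar rule (illustrative
+/// only, not exercised by the kernel plumbing):
+///
+///     std::complex<float> z1{-INFINITY, 3.0f}; // projects to {+INFINITY, +0.0f}
+///     std::complex<float> z2{2.0f, -INFINITY}; // projects to {+INFINITY, -0.0f}
+///     std::complex<float> z3{2.0f, 3.0f};      // finite: returned unchanged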
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::proj +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ProjFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::false_type; + + resT operator()(const argT &in) const + { + using realT = typename argT::value_type; + const realT x = std::real(in); + const realT y = std::imag(in); + + if (std::isinf(x)) { + return value_at_infinity(y); + } + else if (std::isinf(y)) { + return value_at_infinity(y); + } + else { + return in; + } + } + +private: + template + std::complex value_at_infinity(const T &y) const + { + const T res_im = sycl::copysign(T(0), y); + return std::complex{std::numeric_limits::infinity(), res_im}; + } +}; + +template +using ProjContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ProjStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct ProjOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ProjContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class proj_contig_kernel; + +template +sycl::event proj_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using ProjHS = hyperparam_detail::ProjContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = ProjHS::vec_sz; + static constexpr std::uint8_t n_vecs = ProjHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct ProjContigFactory +{ + fnT get() + { + if constexpr (!ProjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (std::is_same_v>) { + fnT fn = proj_contig_impl; + return fn; + } + else { + fnT fn = proj_contig_impl; + return fn; + } + } + } +}; + +template +struct ProjTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::proj(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename ProjOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class proj_strided_kernel; + +template +sycl::event + proj_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, ProjOutputType, ProjStridedFunctor, proj_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ProjStridedFactory +{ + fnT get() + { + if constexpr (!ProjOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = proj_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::proj diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp new file mode 100644 index 000000000000..d21a9e6baa7d --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -0,0 +1,231 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of REAL(x) function. 
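// Illustration (not part of the patch): RealFunctor below is the identity for
// real-valued inputs and extracts the real component of complex inputs; the
// type map sends std::complex<float> to float and std::complex<double> to
// double. demo_real is a hypothetical host-side equivalent.
#include <complex>
#include <type_traits>

template <typename T>
auto demo_real(const T &in)
{
    if constexpr (std::is_same_v<T, std::complex<float>> ||
                  std::is_same_v<T, std::complex<double>>) {
        return std::real(in); // complex<T> -> T
    }
    else {
        return in; // bool, integer and floating inputs pass through
    }
}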
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::real +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::is_complex_v; + +template +struct RealFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex_v) { + return std::real(in); + } + else { + static_assert(std::is_same_v); + return in; + } + } +}; + +template +using RealContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RealStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct RealOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, float>, + td_ns::TypeMapResultEntry, double>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct RealContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class real_contig_kernel; + +template +sycl::event real_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RealHS = hyperparam_detail::RealContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RealHS::vec_sz; + static constexpr std::uint8_t n_vecs = RealHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct RealContigFactory +{ + fnT get() + { + if constexpr (!RealOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = real_contig_impl; + return fn; + } + } +}; + +template +struct RealTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::real(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename RealOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class real_strided_kernel; + +template +sycl::event + real_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, RealOutputType, RealStridedFunctor, real_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct RealStridedFactory +{ + fnT get() + { + if constexpr (!RealOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = real_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::real diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp new file mode 100644 index 000000000000..f26f4043c9ab --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -0,0 +1,229 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of RECIPROCAL(x) +/// function. 
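// Illustration (not part of the patch): scalar behavior of the
// ReciprocalFunctor below. Its output-type map admits only floating-point and
// complex inputs, so 1/x is never truncated to an integer. demo_reciprocal is
// a hypothetical host-side equivalent.
#include <complex>

template <typename T>
T demo_reciprocal(const T &in)
{
    return T(1) / in; // for complex T this is the full complex reciprocal
}
// demo_reciprocal(0.25) == 4.0;
// demo_reciprocal(std::complex<double>(0.0, 1.0)) == {0.0, -1.0}.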
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::reciprocal +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct ReciprocalFunctor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + + using realT = typename argT::value_type; + + return realT(1) / exprm_ns::complex(in); + } + else { + return argT(1) / in; + } + } +}; + +template +using ReciprocalContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using ReciprocalStridedFunctor = + elementwise_common::UnaryStridedFunctor>; + +template +struct ReciprocalOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct ReciprocalContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class reciprocal_contig_kernel; + +template +sycl::event reciprocal_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RecipHS = hyperparam_detail::ReciprocalContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RecipHS::vec_sz; + static constexpr std::uint8_t n_vecs = RecipHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, ReciprocalOutputType, ReciprocalContigFunctor, + reciprocal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, + depends); +} + +template +struct ReciprocalContigFactory +{ + fnT get() + { + if constexpr (!ReciprocalOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = reciprocal_contig_impl; + return fn; + } + } +}; + +template +struct ReciprocalTypeMapFactory +{ + /*! 
@brief get typeid for output type of 1 / x */ + std::enable_if_t::value, int> get() + { + using rT = typename ReciprocalOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class reciprocal_strided_kernel; + +template +sycl::event + reciprocal_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct ReciprocalStridedFactory +{ + fnT get() + { + if constexpr (!ReciprocalOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = reciprocal_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::reciprocal diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp new file mode 100644 index 000000000000..65cd97dbe56d --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -0,0 +1,572 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of REMAINDER(x1, x2) +/// function that computes the Python modulus operator, which is specifically +/// designed as the complement to floor_divide(x1, x2). 
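// Illustration (not part of the patch): the Python-style modulus rule that
// RemainderFunctor below applies to real operands -- take the native
// remainder, then shift by the divisor when the signs disagree, so the result
// carries the sign of the divisor. demo_remainder is a hypothetical host-side
// equivalent of the scalar floating-point branch.
#include <cmath>

double demo_remainder(double x1, double x2)
{
    double rem = std::fmod(x1, x2);      // native remainder, sign of x1
    if (rem != 0.0) {
        if ((x2 < 0.0) != (rem < 0.0)) { // logical XOR of the signs
            rem += x2;                   // fold into the divisor's sign range
        }
    }
    else {
        rem = std::copysign(0.0, x2);    // signed zero follows the divisor
    }
    return rem;
}
// demo_remainder(-7.0, 3.0) == 2.0 and demo_remainder(7.0, -3.0) == -2.0,
// matching Python's % operator, where std::fmod alone gives -1.0 and 1.0.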
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::remainder +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct RemainderFunctor +{ + static_assert(std::is_same_v); + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in2 == argT2(0)) { + return resT(0); + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto out = (in1 % in2); + if (out != 0 && l_xor(in1 < 0, in2 < 0)) { + out += in2; + } + return out; + } + else { + return (in1 % in2); + } + } + else { + auto rem = sycl::fmod(in1, in2); + if (rem) { + if (l_xor(in2 < 0, rem < 0)) { + rem += in2; + } + } + else { + rem = sycl::copysign(resT(0), in2); + } + return rem; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v || std::is_integral_v) { + sycl::vec rem; +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (in2[i] == argT2(0)) { + rem[i] = resT(0); + } + else { + rem[i] = in1[i] % in2[i]; + if constexpr (std::is_signed_v || + std::is_signed_v) { + if (rem[i] != 0 && l_xor(in1[i] < 0, in2[i] < 0)) { + rem[i] += in2[i]; + } + } + } + } + return rem; + } + else { + auto rem = sycl::fmod(in1, in2); + using remT = typename decltype(rem)::element_type; +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (rem[i]) { + if (l_xor(in2[i] < 0, rem[i] < 0)) { + rem[i] += in2[i]; + } + } + else { + rem[i] = sycl::copysign(remT(0), in2[i]); + } + } + if constexpr (std::is_same_v) { + return rem; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast(rem); + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using RemainderContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + RemainderFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RemainderStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + RemainderFunctor>; + +template +struct RemainderOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct RemainderContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + 
constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class remainder_contig_kernel; + +template +sycl::event remainder_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using RemHS = + hyperparam_detail::RemainderContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, RemainderOutputType, RemainderContigFunctor, + remainder_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct RemainderContigFactory +{ + fnT get() + { + if constexpr (!RemainderOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_contig_impl; + return fn; + } + } +}; + +template +struct RemainderTypeMapFactory +{ + /*! @brief get typeid for output type of remainder(T x, T y) */ + std::enable_if_t::value, int> get() + { + using rT = typename RemainderOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class remainder_strided_kernel; + +template +sycl::event + remainder_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, RemainderOutputType, RemainderStridedFunctor, + remainder_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct RemainderStridedFactory +{ + fnT get() + { + if constexpr (!RemainderOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_strided_impl; + return fn; + } + } +}; + +template +struct RemainderInplaceFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + // functor is only well-defined when argT and resT are the same + static_assert(std::is_same_v); + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in == argT(0)) { + res = 0; + return; + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto tmp = res; + res %= in; + if (res != resT(0) && l_xor(tmp < 0, in < 0)) { + res += in; + } + } + else { + res %= in; + } + } + else { + res = sycl::fmod(res, in); + if (res) { + if (l_xor(in < 0, res < 0)) { + res += in; + } + } + else { + res = sycl::copysign(resT(0), in); + } + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (in[i] == argT(0)) { + res[i] = 0; + } + else { + auto rem = res[i] % in[i]; + if constexpr (std::is_signed_v || + std::is_signed_v) { + if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) { + rem += in[i]; + } + } + res[i] = rem; + } + } + } + else { + res = sycl::fmod(res, in); +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (res[i]) { + if (l_xor(in[i] < 0, 
res[i] < 0)) { + res[i] += in[i]; + } + } + else { + res[i] = sycl::copysign(resT(0), in[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const { return (b1 != b2); } +}; + +template +using RemainderInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + RemainderInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RemainderInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + RemainderInplaceFunctor>; + +template +class remainder_inplace_contig_kernel; + +/* @brief Types supported by in-place remainder */ +template +struct RemainderInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct RemainderInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x %= y */ + std::enable_if_t::value, int> get() + { + if constexpr (RemainderInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + remainder_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using RemHS = + hyperparam_detail::RemainderContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RemHS::vec_sz; + static constexpr std::uint8_t n_vecs = RemHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, RemainderInplaceContigFunctor, + remainder_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct RemainderInplaceContigFactory +{ + fnT get() + { + if constexpr (!RemainderInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_contig_impl; + return fn; + } + } +}; + +template +class remainder_inplace_strided_kernel; + +template +sycl::event remainder_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, RemainderInplaceStridedFunctor, + remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct RemainderInplaceStridedFactory +{ + fnT get() + { + if constexpr (!RemainderInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::remainder diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp new file mode 100644 index 000000000000..b20166a4d505 --- /dev/null +++ 
b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -0,0 +1,241 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of ROUND(x) function. 
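// Illustration (not part of the patch): RoundFunctor below rounds through
// sycl::rint, i.e. round-half-to-even ("banker's rounding"), not
// round-half-away-from-zero; integral inputs pass through, and the real and
// imaginary parts of complex inputs each get the same rule. demo_round uses
// the host equivalent std::rint under the default round-to-nearest-even mode.
#include <cmath>

double demo_round(double x) { return std::rint(x); }
// demo_round(0.5) == 0.0, demo_round(1.5) == 2.0, demo_round(2.5) == 2.0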
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::round +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct RoundFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + + if constexpr (std::is_integral_v) { + return in; + } + else if constexpr (is_complex::value) { + using realT = typename argT::value_type; + return resT{round_func(std::real(in)), + round_func(std::imag(in))}; + } + else { + return round_func(in); + } + } + +private: + template + T round_func(const T &input) const + { + return sycl::rint(input); + } +}; + +template +using RoundContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RoundStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct RoundOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct RoundContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class round_contig_kernel; + +template +sycl::event round_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RoundHS = hyperparam_detail::RoundContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RoundHS::vec_sz; + static constexpr std::uint8_t n_vecs = RoundHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct RoundContigFactory +{ + fnT get() + { + if constexpr (!RoundOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = round_contig_impl; + return fn; + } + } +}; + +template +struct RoundTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::rint(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RoundOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class round_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    round_strided_impl(sycl::queue &exec_q,
+                       std::size_t nelems,
+                       int nd,
+                       const ssize_t *shape_and_strides,
+                       const char *arg_p,
+                       ssize_t arg_offset,
+                       char *res_p,
+                       ssize_t res_offset,
+                       const std::vector<sycl::event> &depends,
+                       const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, RoundOutputType, RoundStridedFunctor, round_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct RoundStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RoundOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = round_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::round
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
new file mode 100644
index 000000000000..aa4f1113d839
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -0,0 +1,206 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of RSQRT(x)
+/// function that computes the reciprocal square root.
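// Illustration (not part of the patch): RsqrtFunctor below evaluates the
// reciprocal square root with a single sycl::rsqrt(x) call, which lets a
// device use a native rsqrt instruction where one exists. A host-side
// reference for comparison (demo_rsqrt is hypothetical):
#include <cmath>

double demo_rsqrt(double x) { return 1.0 / std::sqrt(x); }
// demo_rsqrt(4.0) == 0.5; demo_rsqrt(0.0) == +inf, consistent with 1/sqrt(x).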
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::rsqrt +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct RsqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const { return sycl::rsqrt(in); } +}; + +template +using RsqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using RsqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct RsqrtOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct RsqrtContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // namespace hyperparam_detail + +template +class rsqrt_contig_kernel; + +template +sycl::event rsqrt_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using RsqrtHS = hyperparam_detail::RsqrtContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct RsqrtContigFactory +{ + fnT get() + { + if constexpr (!RsqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_contig_impl; + return fn; + } + } +}; + +template +struct RsqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::rsqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename RsqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class rsqrt_strided_kernel; + +template +sycl::event + rsqrt_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct RsqrtStridedFactory +{ + fnT get() + { + if constexpr (!RsqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::rsqrt diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp new file mode 100644 index 000000000000..ceb3d1320f9c --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -0,0 +1,258 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SIGN(x) function. 
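// Illustration (not part of the patch): the branchless comparison trick that
// SignFunctor below uses for signed real inputs; NaN propagates, and for
// complex z the kernel instead returns z / |z| (or 0 when z == 0), a
// unit-magnitude value carrying the phase of z. demo_sign is hypothetical.
#include <cmath>

double demo_sign(double v)
{
    if (std::isnan(v)) {
        return v;                         // NaN in, NaN out
    }
    return double((0.0 < v) - (v < 0.0)); // -1.0, 0.0 or +1.0
}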
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cabs_impl.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sign +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct SignFunctor +{ + static_assert(std::is_same_v); + using is_constant = typename std::false_type; + // constexpr resT constant_value = resT{}; + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + using supports_sg_loadstore = std::false_type; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + if constexpr (std::is_unsigned_v) { + return resT(0 < in); + } + else { + return sign_impl(in); + } + } + else { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + if (in == argT(0)) { + return resT(0); + } + else { + auto z = exprm_ns::complex(in); + return (z / detail::cabs(in)); + } + } + else { + if (std::isnan(in)) { + return std::numeric_limits::quiet_NaN(); + } + else { + return sign_impl(in); + } + } + } + } + +private: + template + T sign_impl(const T &v) const + { + return (T(0) < v) - (v < T(0)); + } +}; + +template +using SignContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +struct SignOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SignContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sign_contig_kernel; + +template +sycl::event sign_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SignHS = hyperparam_detail::SignContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SignHS::vec_sz; + static constexpr std::uint8_t n_vecs = SignHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SignContigFactory +{ + fnT get() + { + if constexpr (!SignOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sign_contig_impl; + return fn; + } + } +}; + +template +struct SignTypeMapFactory +{ + /*! 
@brief get typeid for output type of sign(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SignOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +using SignStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +class sign_strided_kernel; + +template +sycl::event + sign_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SignOutputType, SignStridedFunctor, sign_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SignStridedFactory +{ + fnT get() + { + if constexpr (!SignOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sign_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sign diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp new file mode 100644 index 000000000000..65e9e5a202a9 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -0,0 +1,220 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SIGNBIT(x) +/// function that tests whether the sign bit of the tensor element is set. 
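// Illustration (not part of the patch): SignbitFunctor below wraps
// std::signbit, which reads the sign bit directly, so negative zero (and a
// negative NaN) reports true even though -0.0 compares equal to 0.0. This is
// why the kernel cannot simply be written as x < 0. demo_signbit is
// hypothetical.
#include <cmath>

bool demo_signbit(double x) { return std::signbit(x); }
// demo_signbit(-0.0) == true, while (-0.0 < 0.0) == false;
// demo_signbit(3.0) == false.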
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::signbit +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct SignbitFunctor +{ + static_assert(std::is_same_v); + + using is_constant = std::false_type; + static constexpr resT constant_value = false; + using supports_vec = std::true_type; + using supports_sg_loadstore = std::true_type; + + resT operator()(const argT &in) const { return std::signbit(in); } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = sycl::signbit(in); + + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + + return vec_cast(res_vec); + } +}; + +template +using SignbitContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SignbitStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SignbitOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SignbitContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class signbit_contig_kernel; + +template +sycl::event signbit_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SignbitHS = hyperparam_detail::SignbitContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SignbitHS::vec_sz; + static constexpr std::uint8_t n_vecs = SignbitHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SignbitContigFactory +{ + fnT get() + { + if constexpr (!SignbitOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = signbit_contig_impl; + return fn; + } + } +}; + +template +struct SignbitTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::signbit(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SignbitOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3>
+class signbit_strided_kernel;
+
+template <typename argTy>
+sycl::event
+    signbit_strided_impl(sycl::queue &exec_q,
+                         std::size_t nelems,
+                         int nd,
+                         const ssize_t *shape_and_strides,
+                         const char *arg_p,
+                         ssize_t arg_offset,
+                         char *res_p,
+                         ssize_t res_offset,
+                         const std::vector<sycl::event> &depends,
+                         const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, SignbitOutputType, SignbitStridedFunctor,
+        signbit_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T>
+struct SignbitStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SignbitOutputType<T>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = signbit_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::signbit
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
new file mode 100644
index 000000000000..d1e3caa9effe
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -0,0 +1,333 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of SIN(x) function.
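// Illustration (not part of the patch): the reduction SinFunctor below uses
// for complex special values. With w = sinh(i * z), the identity
// sin(z) = -i * sinh(i * z) means the kernel can analyze w and return
// {imag(w), -real(w)}. demo_sin_via_sinh is a hypothetical host-side check.
#include <complex>

std::complex<double> demo_sin_via_sinh(std::complex<double> z)
{
    const std::complex<double> i{0.0, 1.0};
    const std::complex<double> w = std::sinh(i * z);
    return {w.imag(), -w.real()}; // equals std::sin(z)
}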
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sin +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct SinFunctor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + realT const &in_re = std::real(in); + realT const &in_im = std::imag(in); + + const bool in_re_finite = std::isfinite(in_re); + const bool in_im_finite = std::isfinite(in_im); + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (in_re_finite && in_im_finite) { + resT res = + exprm_ns::sin(exprm_ns::complex(in)); // sin(in); + if (in_re == realT(0)) { + res.real(sycl::copysign(realT(0), in_re)); + } + return res; + } + + /* + * since sin(in) = -I * sinh(I * in), for special cases, + * we calculate real and imaginary parts of z = sinh(I * in) and + * then return { imag(z) , -real(z) } which is sin(in). + */ + const realT x = -in_im; + const realT y = in_re; + const bool xfinite = in_im_finite; + const bool yfinite = in_re_finite; + /* + * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. + * + * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN). + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT sinh_im = q_nan; + const realT sinh_re = sycl::copysign(realT(0), x * sinh_im); + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(+-Inf +- I 0) = +-Inf + I +-0. + * + * sinh(NaN +- I 0) = d(NaN) + I +-0. + */ + if (y == realT(0) && !xfinite) { + if (std::isnan(x)) { + const realT sinh_re = x; + const realT sinh_im = y; + return resT{sinh_im, -sinh_re}; + } + const realT sinh_re = x; + const realT sinh_im = sycl::copysign(realT(0), y); + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(x +- I Inf) = dNaN + I dNaN. + * + * sinh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + const realT sinh_re = q_nan; + const realT sinh_im = x * sinh_re; + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(+-Inf + I NaN) = +-Inf + I d(NaN). + * The sign of Inf in the result is unspecified. Choice = normally + * the same as d(NaN). + * + * sinh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. + * Choice = always - here for sinh to have positive result for + * imaginary part of sin. 
+ * + * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y) + */ + if (std::isinf(x)) { + if (!yfinite) { + const realT sinh_re = -x * x; + const realT sinh_im = x * (y - y); + return resT{sinh_im, -sinh_re}; + } + const realT sinh_re = x * sycl::cos(y); + const realT sinh_im = + std::numeric_limits::infinity() * sycl::sin(y); + return resT{sinh_im, -sinh_re}; + } + + /* + * sinh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * sinh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * sinh(NaN + I y) = d(NaN) + I d(NaN). + */ + const realT y_m_y = (y - y); + const realT sinh_re = (x * x) * y_m_y; + const realT sinh_im = (x + x) * y_m_y; + return resT{sinh_im, -sinh_re}; + } + else { + static_assert(std::is_same_v); + if (in == 0) { + return in; + } + return sycl::sin(in); + } + } +}; + +template +using SinContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SinStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SinOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SinContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sin_contig_kernel; + +template +sycl::event sin_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SinHS = hyperparam_detail::SinContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SinHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SinContigFactory +{ + fnT get() + { + if constexpr (!SinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sin_contig_impl; + return fn; + } + } +}; + +template +struct SinTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::sin(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SinOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class sin_strided_kernel; + +template +sycl::event sin_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SinOutputType, SinStridedFunctor, sin_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SinStridedFactory +{ + fnT get() + { + if constexpr (!SinOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sin_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sin diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp new file mode 100644 index 000000000000..f81a2730fd17 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -0,0 +1,302 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SINH(x) function. 
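The complex branch of SinFunctor above is organized around the identity sin(z) = -i*sinh(i*z): for the non-finite cases it computes z' = sinh(i*z) componentwise and returns {imag(z'), -real(z')}. A small host-side check of that identity using std::complex (illustrative only, not part of the kernels):

// Host-side sanity check of sin(z) == -i * sinh(i * z), the identity the
// complex SinFunctor special cases are derived from. Illustrative only.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    const std::complex<double> z{0.3, -1.7};
    const std::complex<double> i{0.0, 1.0};

    const std::complex<double> lhs = std::sin(z);
    const std::complex<double> rhs = -i * std::sinh(i * z);

    assert(std::abs(lhs - rhs) < 1e-12);
    return 0;
}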
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sinh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct SinhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + const realT x = std::real(in); + const realT y = std::imag(in); + + const bool xfinite = std::isfinite(x); + const bool yfinite = std::isfinite(y); + + /* + * Handle the nearly-non-exceptional cases where + * real and imaginary parts of input are finite. + */ + if (xfinite && yfinite) { + return exprm_ns::sinh(exprm_ns::complex(in)); + } + /* + * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. + * + * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN). + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == realT(0) && !yfinite) { + const realT res_re = sycl::copysign(realT(0), x * (y - y)); + return resT{res_re, y - y}; + } + + /* + * sinh(+-Inf +- I 0) = +-Inf + I +-0. + * + * sinh(NaN +- I 0) = d(NaN) + I +-0. + */ + if (y == realT(0) && !xfinite) { + if (std::isnan(x)) { + return resT{x, y}; + } + const realT res_im = sycl::copysign(realT(0), y); + return resT{x, res_im}; + } + + /* + * sinh(x +- I Inf) = dNaN + I dNaN. + * + * sinh(x + I NaN) = d(NaN) + I d(NaN). + */ + if (xfinite && !yfinite) { + return resT{y - y, x * (y - y)}; + } + + /* + * sinh(+-Inf + I NaN) = +-Inf + I d(NaN). + * The sign of Inf in the result is unspecified. Choice = normally + * the same as d(NaN). + * + * sinh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * + * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y) + */ + if (!xfinite && !std::isnan(x)) { + if (!yfinite) { + return resT{x * x, x * (y - y)}; + } + return resT{x * sycl::cos(y), + std::numeric_limits::infinity() * + sycl::sin(y)}; + } + + /* + * sinh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * sinh(NaN +- I Inf) = d(NaN) + I d(NaN). + * + * sinh(NaN + I y) = d(NaN) + I d(NaN). 
+ */ + return resT{(x * x) * (y - y), (x + x) * (y - y)}; + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::sinh(in); + } + } +}; + +template +using SinhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SinhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SinhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SinhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sinh_contig_kernel; + +template +sycl::event sinh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SinhHS = hyperparam_detail::SinhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SinhHS::vec_sz; + static constexpr std::uint8_t n_vecs = SinhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SinhContigFactory +{ + fnT get() + { + if constexpr (!SinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sinh_contig_impl; + return fn; + } + } +}; + +template +struct SinhTypeMapFactory +{ + /*! @brief get typeid for output type of sycl::sinh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SinhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class sinh_strided_kernel; + +template +sycl::event + sinh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SinhOutputType, SinhStridedFunctor, sinh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SinhStridedFactory +{ + fnT get() + { + if constexpr (!SinhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sinh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sinh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp new file mode 100644 index 000000000000..08b3b092d1ca --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -0,0 +1,224 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SQRT(x) +/// function that computes a square root. 
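Each *OutputType struct in these headers, including SqrtOutputType below, resolves its result type the same way: std::disjunction scans a list of td_ns::TypeMapResultEntry candidates, inherits from the first one whose argument type matches, and falls through to DefaultResultEntry<void> for unsupported types (which is what is_defined tests). A stripped-down model of that lookup, using hypothetical mini-traits for exposition rather than the td_ns code itself:

// Minimal model of the TypeMapResultEntry / DefaultResultEntry lookup used
// by the *OutputType structs. Hypothetical reimplementation for exposition.
#include <complex>
#include <type_traits>

template <typename T, typename ArgT, typename ResT>
struct TypeMapEntry : std::is_same<T, ArgT>
{
    using result_type = ResT;
};

template <typename ResT> struct DefaultEntry : std::true_type
{
    using result_type = ResT;
};

// std::disjunction derives from the first entry whose ::value is true,
// so result_type comes from the first matching TypeMapEntry.
template <typename T>
using sqrt_result_t = typename std::disjunction<
    TypeMapEntry<T, float, float>,
    TypeMapEntry<T, double, double>,
    TypeMapEntry<T, std::complex<float>, std::complex<float>>,
    DefaultEntry<void>>::result_type;

static_assert(std::is_same_v<sqrt_result_t<float>, float>);
static_assert(std::is_same_v<sqrt_result_t<int>, void>); // unsupported -> void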
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::sqrt +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct SqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + return exprm_ns::sqrt(exprm_ns::complex(in)); + } + else { + return sycl::sqrt(in); + } + } +}; + +template +using SqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SqrtOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, std::complex>, + td_ns:: + TypeMapResultEntry, std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SqrtContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class sqrt_contig_kernel; + +template +sycl::event sqrt_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SqrtHS = hyperparam_detail::SqrtContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SqrtHS::vec_sz; + static constexpr std::uint8_t n_vecs = SqrtHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SqrtContigFactory +{ + fnT get() + { + if constexpr (!SqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sqrt_contig_impl; + return fn; + } + } +}; + +template +struct SqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::sqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename SqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class sqrt_strided_kernel; + +template +sycl::event + sqrt_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SqrtOutputType, SqrtStridedFunctor, sqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SqrtStridedFactory +{ + fnT get() + { + if constexpr (!SqrtOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = sqrt_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::sqrt diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp new file mode 100644 index 000000000000..de3007acfbea --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -0,0 +1,251 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
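The strided entry points such as sqrt_strided_impl above receive the iteration space as one packed array: by the dpctl convention these kernels follow, shape_and_strides holds 3*nd entries, i.e. the shape, then the input strides, then the output strides (all in elements), copied into USM device memory by the caller. A host-side sketch of that packing; the helper name is illustrative and the layout is the assumed convention:

// Host-side sketch of the assumed shape_and_strides layout for a unary
// strided kernel: nd shape entries, then nd input strides, then nd output
// strides, all in elements. Helper name and layout claim are illustrative.
#include <cstddef>
#include <vector>

std::vector<std::ptrdiff_t>
pack_shape_strides(const std::vector<std::ptrdiff_t> &shape,
                   const std::vector<std::ptrdiff_t> &arg_strides,
                   const std::vector<std::ptrdiff_t> &res_strides)
{
    std::vector<std::ptrdiff_t> packed;
    packed.reserve(3 * shape.size());
    packed.insert(packed.end(), shape.begin(), shape.end());
    packed.insert(packed.end(), arg_strides.begin(), arg_strides.end());
    packed.insert(packed.end(), res_strides.begin(), res_strides.end());
    // The real callers copy this host buffer into USM device memory and pass
    // the device pointer as `shape_and_strides`, with `nd = shape.size()`.
    return packed;
}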
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SQUARE(x) +/// +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::square +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template +struct SquareFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::negation< + std::disjunction, is_complex>>; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + auto z = exprm_ns::complex(in); + + return z * z; + } + else { + return in * in; + } + } + + template + sycl::vec operator()(const sycl::vec &in) const + { + auto const &res_vec = in * in; + using deducedT = typename std::remove_cv_t< + std::remove_reference_t>::element_type; + if constexpr (std::is_same_v) { + return res_vec; + } + else { + return vec_cast(res_vec); + } + } +}; + +template +using SquareContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SquareStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct SquareOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct SquareContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class square_contig_kernel; + +template +sycl::event square_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using SquareHS = hyperparam_detail::SquareContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SquareHS::vec_sz; + static constexpr std::uint8_t n_vecs = SquareHS::n_vecs; + + return 
elementwise_common::unary_contig_impl< + argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel, + vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct SquareContigFactory +{ + fnT get() + { + if constexpr (!SquareOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = square_contig_impl; + return fn; + } + } +}; + +template +struct SquareTypeMapFactory +{ + /*! @brief get typeid for output type of x * x */ + std::enable_if_t::value, int> get() + { + using rT = typename SquareOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class square_strided_kernel; + +template +sycl::event + square_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, SquareOutputType, SquareStridedFunctor, square_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct SquareStridedFactory +{ + fnT get() + { + if constexpr (!SquareOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = square_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::square diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp new file mode 100644 index 000000000000..431596594ad3 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -0,0 +1,640 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
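Unlike the transcendental kernels, SquareFunctor above advertises supports_vec for non-complex types and adds a sycl::vec overload, so the contiguous kernel can square whole sub-group vectors at once; when the element type deduced from in * in differs from resT, vec_cast converts the result back. A plain-C++ model of that cast-only-when-needed structure, with std::array standing in for sycl::vec (illustrative sketch only):

// Plain-C++ model of the sycl::vec overload in SquareFunctor above:
// square element-wise, then cast only when the deduced element type
// differs from resT. vec_cast_model mirrors dpctl's vec_cast.
#include <array>
#include <cstddef>
#include <type_traits>

template <typename To, typename From, std::size_t N>
std::array<To, N> vec_cast_model(const std::array<From, N> &v)
{
    std::array<To, N> r{};
    for (std::size_t i = 0; i < N; ++i) {
        r[i] = static_cast<To>(v[i]);
    }
    return r;
}

template <typename resT, typename argT, std::size_t N>
auto square_vec_model(const std::array<argT, N> &in)
{
    std::array<argT, N> res{};
    for (std::size_t i = 0; i < N; ++i) {
        res[i] = in[i] * in[i]; // element-wise in * in
    }
    if constexpr (std::is_same_v<resT, argT>) {
        return res; // no conversion needed, return as-is
    }
    else {
        return vec_cast_model<resT, argT, N>(res); // convert to resT
    }
}

// Usage: square_vec_model<float>(std::array<int, 4>{1, 2, 3, 4}) yields
// std::array<float, 4>{1.f, 4.f, 9.f, 16.f}.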
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of SUBTRACT(x1, x2) +/// function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "vec_size_util.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::kernels::subtract +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct SubtractFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return in1 - in2; + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 - in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using SubtractContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + SubtractFunctor>; + +template +struct SubtractOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct SubtractContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class subtract_contig_kernel; + +template +sycl::event subtract_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using SubHS = + hyperparam_detail::SubtractContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, SubtractOutputType, SubtractContigFunctor, + subtract_contig_kernel, 
vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct SubtractContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2> struct SubtractTypeMapFactory
+{
+    /*! @brief get typeid for output type of subtract(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SubtractOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class subtract_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    subtract_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, SubtractOutputType, SubtractStridedFunctor,
+        subtract_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct SubtractStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigRowContigMatrixBroadcastingFunctor =
+    elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2, typename T3>
+class subtract_matrix_row_broadcast_sg_krn;
+
+template <typename T1, typename T2, typename T3>
+class subtract_row_matrix_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event subtract_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig.
matrix, + // res[i,j] = mat[i,j] - vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, SubtractContigMatrixContigRowBroadcastingFunctor, + subtract_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p, + mat_offset, vec_p, vec_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!SubtractOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename SubtractOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event subtract_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = op(vec[j], mat[i,j]) + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, SubtractContigRowContigMatrixBroadcastingFunctor, + subtract_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, vec_p, + vec_offset, mat_p, mat_offset, + res_p, res_offset, depends); +} + +template +struct SubtractContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!SubtractOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename SubtractOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + subtract_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct SubtractInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) { res -= in; } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res -= in; + } +}; + +template +using SubtractInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + SubtractInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using SubtractInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + SubtractInplaceFunctor>; + +template +class subtract_inplace_contig_kernel; + +/* @brief Types supported by in-place subtraction */ +template +struct SubtractInplaceTypePairSupport +{ + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SubtractInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of x -= y */ + std::enable_if_t::value, int> get() + { + if constexpr (SubtractInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +sycl::event + subtract_inplace_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using SubHS = + hyperparam_detail::SubtractContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = SubHS::vec_sz; + static constexpr std::uint8_t n_vecs = SubHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, SubtractInplaceContigFunctor, + subtract_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct SubtractInplaceContigFactory +{ + fnT get() + { + if constexpr (!SubtractInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_inplace_contig_impl; + return fn; + } + } +}; + +template +class subtract_inplace_strided_kernel; + +template +sycl::event subtract_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, SubtractInplaceStridedFunctor, + subtract_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct SubtractInplaceStridedFactory +{ + fnT get() + { + if constexpr (!SubtractInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_inplace_strided_impl; + return fn; + } + } +}; + +template +class subtract_inplace_row_matrix_broadcast_sg_krn; + +template +using SubtractInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + SubtractInplaceFunctor>; + +template +sycl::event subtract_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, SubtractInplaceRowMatrixBroadcastingFunctor, + subtract_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct SubtractInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!SubtractInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = subtract_inplace_row_matrix_broadcast_impl; + return fn; + } + } + 
}
+};
+
+} // namespace dpctl::tensor::kernels::subtract
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
new file mode 100644
index 000000000000..5cadec6ce2a4
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
@@ -0,0 +1,44 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines the SYCL_EXT_ONEAPI_COMPLEX macro and indirectly
+/// includes the experimental oneAPI SYCL complex extension header file.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#define SYCL_EXT_ONEAPI_COMPLEX
+#if __has_include(<sycl/ext/oneapi/experimental/complex/complex.hpp>)
+#include <sycl/ext/oneapi/experimental/complex/complex.hpp>
+#else
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#endif
+
+namespace exprm_ns = sycl::ext::oneapi::experimental;
diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
new file mode 100644
index 000000000000..2db2a6b5fbf8
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -0,0 +1,276 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TAN(x) function. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::tan +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct TanFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + /* + * since tan(in) = -I * tanh(I * in), for special cases, + * we calculate real and imaginary parts of z = tanh(I * in) and + * return { imag(z) , -real(z) } which is tan(in). + */ + const realT x = -std::imag(in); + const realT y = std::real(in); + /* + * tanh(NaN + i 0) = NaN + i 0 + * + * tanh(NaN + i y) = NaN + i NaN for y != 0 + * + * The imaginary part has the sign of x*sin(2*y), but there's no + * special effort to get this right. + * + * tanh(+-Inf +- i Inf) = +-1 +- 0 + * + * tanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite + * + * The imaginary part of the sign is unspecified. This special + * case is only needed to avoid a spurious invalid exception when + * y is infinite. + */ + if (!std::isfinite(x)) { + if (std::isnan(x)) { + const realT tanh_re = x; + const realT tanh_im = (y == realT(0) ? y : x * y); + return resT{tanh_im, -tanh_re}; + } + const realT tanh_re = sycl::copysign(realT(1), x); + const realT tanh_im = sycl::copysign( + realT(0), std::isinf(y) ? 
y : sycl::sin(y) * sycl::cos(y)); + return resT{tanh_im, -tanh_re}; + } + /* + * tanh(x + i NAN) = NaN + i NaN for non-zero x + * tanh(x +- i Inf) = NaN + i NaN for non-zero x + * tanh(0 + i NAN) = 0 + i NaN + * tanh(0 +- i Inf) = 0 + i NaN + */ + if (!std::isfinite(y)) { + if (x == realT(0)) { + return resT{q_nan, x}; + } + return resT{q_nan, q_nan}; + } + /* ordinary cases */ + return exprm_ns::tan(exprm_ns::complex(in)); // tan(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::tan(in); + } + } +}; + +template +using TanContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TanStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct TanOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct TanContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class tan_contig_kernel; + +template +sycl::event tan_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using TanHS = hyperparam_detail::TanContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = TanHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct TanContigFactory +{ + fnT get() + { + if constexpr (!TanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tan_contig_impl; + return fn; + } + } +}; + +template +struct TanTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::tan(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename TanOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class tan_strided_kernel; + +template +sycl::event tan_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, TanOutputType, TanStridedFunctor, tan_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TanStridedFactory +{ + fnT get() + { + if constexpr (!TanOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tan_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::tan diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp new file mode 100644 index 000000000000..dde16128fb1a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -0,0 +1,270 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TANH(x) function. 
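TanFunctor above handles non-finite inputs through the rotation tan(z) = -i*tanh(i*z): it forms x = -imag(z), y = real(z), walks the C99 Annex G tanh special cases in those coordinates, and only calls exprm_ns::tan on the ordinary path. Below is a host-side spot-check of two of the encoded special values; it assumes an Annex-G-conforming std::tanh as the reference, so treat it as illustrative rather than portable test code.

// Host-side spot-check of two tanh special values from the table that
// TanFunctor encodes (via the tan(z) = -i*tanh(i*z) rotation), using
// std::tanh as reference. Assumes a C99 Annex G conforming libm.
#include <cassert>
#include <cmath>
#include <complex>
#include <limits>

int main()
{
    const double inf = std::numeric_limits<double>::infinity();
    const double nan = std::numeric_limits<double>::quiet_NaN();

    // tanh(+Inf + I*y) = 1 + I*0*sin(2*y) for finite y
    const std::complex<double> a = std::tanh(std::complex<double>(inf, 0.5));
    assert(a.real() == 1.0);

    // tanh(NaN + I*0) = NaN + I*0
    const std::complex<double> b = std::tanh(std::complex<double>(nan, 0.0));
    assert(std::isnan(b.real()) && b.imag() == 0.0);

    return 0;
}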
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::tanh +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct TanhFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + static constexpr realT q_nan = + std::numeric_limits::quiet_NaN(); + + const realT x = std::real(in); + const realT y = std::imag(in); + /* + * tanh(NaN + i 0) = NaN + i 0 + * + * tanh(NaN + i y) = NaN + i NaN for y != 0 + * + * The imaginary part has the sign of x*sin(2*y), but there's no + * special effort to get this right. + * + * tanh(+-Inf +- i Inf) = +-1 +- 0 + * + * tanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite + * + * The imaginary part of the sign is unspecified. This special + * case is only needed to avoid a spurious invalid exception when + * y is infinite. + */ + if (!std::isfinite(x)) { + if (std::isnan(x)) { + return resT{q_nan, (y == realT(0) ? y : q_nan)}; + } + const realT res_re = sycl::copysign(realT(1), x); + const realT res_im = sycl::copysign( + realT(0), std::isinf(y) ? 
y : sycl::sin(y) * sycl::cos(y)); + return resT{res_re, res_im}; + } + /* + * tanh(x + i NAN) = NaN + i NaN for non-zero x + * tanh(x +- i Inf) = NaN + i NaN for non-zero x + * tanh(0 + i NAN) = 0 + i NaN + * tanh(0 +- i Inf) = 0 + i NaN + */ + if (!std::isfinite(y)) { + if (x == realT(0)) { + return resT{x, q_nan}; + } + return resT{q_nan, q_nan}; + } + /* ordinary cases */ + return exprm_ns::tanh(exprm_ns::complex(in)); // tanh(in); + } + else { + static_assert(std::is_floating_point_v || + std::is_same_v); + return sycl::tanh(in); + } + } +}; + +template +using TanhContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TanhStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct TanhOutputType +{ + using value_type = typename std::disjunction< + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct TanhContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class tanh_contig_kernel; + +template +sycl::event tanh_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using TanhHS = hyperparam_detail::TanhContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = TanhHS::vec_sz; + static constexpr std::uint8_t n_vecs = TanhHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct TanhContigFactory +{ + fnT get() + { + if constexpr (!TanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tanh_contig_impl; + return fn; + } + } +}; + +template +struct TanhTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::tanh(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename TanhOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class tanh_strided_kernel; + +template +sycl::event + tanh_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, TanhOutputType, TanhStridedFunctor, tanh_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TanhStridedFactory +{ + fnT get() + { + if constexpr (!TanhOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = tanh_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::tanh diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp new file mode 100644 index 000000000000..caa1cd2029c4 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -0,0 +1,662 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TRUE_DIVIDE(x1, x2) +/// function. 
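+///
+/// The TrueDivideOutputType table below instantiates kernels only for
+/// floating-point and complex operand pairs; a hedged sketch of what the
+/// entries imply (the td_ns entry list below is authoritative):
+///
+/// \code{.cpp}
+/// using r_t =
+///     TrueDivideOutputType<float, std::complex<float>>::value_type;
+/// static_assert(std::is_same_v<r_t, std::complex<float>>);
+/// // integer pairs have no entry: the factories below then return a
+/// // nullptr function pointer and callers must promote beforehand
+/// static_assert(
+///     !TrueDivideOutputType<std::int32_t, std::int32_t>::is_defined);
+/// \endcode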
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "sycl_complex.hpp" +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::true_divide +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template +struct TrueDivideFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + if constexpr (tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + using realT2 = typename argT2::value_type; + + return exprm_ns::complex(in1) / + exprm_ns::complex(in2); + } + else if constexpr (tu_ns::is_complex::value && + !tu_ns::is_complex::value) { + using realT1 = typename argT1::value_type; + + return exprm_ns::complex(in1) / in2; + } + else if constexpr (!tu_ns::is_complex::value && + tu_ns::is_complex::value) { + using realT2 = typename argT2::value_type; + + return in1 / exprm_ns::complex(in2); + } + else { + return in1 / in2; + } + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = in1 / in2; + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TrueDivideStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + TrueDivideFunctor>; + +template +struct TrueDivideOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + float, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + double, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::BinaryContigHyperparameterSetEntry; +using vsu_ns::ContigHyperparameterSetDefault; + +template +struct TrueDivideContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class true_divide_contig_kernel; + +template +sycl::event + true_divide_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends 
= {}) +{ + using DivHS = + hyperparam_detail::TrueDivideContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; + + return elementwise_common::binary_contig_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor, + true_divide_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends); +} + +template +struct TrueDivideContigFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_contig_impl; + return fn; + } + } +}; + +template +struct TrueDivideTypeMapFactory +{ + /*! @brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename TrueDivideOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class true_divide_strided_kernel; + +template +sycl::event + true_divide_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg1_p, + ssize_t arg1_offset, + const char *arg2_p, + ssize_t arg2_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, TrueDivideOutputType, TrueDivideStridedFunctor, + true_divide_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template +struct TrueDivideStridedFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_strided_impl; + return fn; + } + } +}; + +template +using TrueDivideContigMatrixContigRowBroadcastingFunctor = + elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor>; + +template +using TrueDivideContigRowContigMatrixBroadcastingFunctor = + elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor< + argT1, + argT2, + resT, + TrueDivideFunctor>; + +template +class true_divide_matrix_row_broadcast_sg_krn; + +template +class true_divide_row_matrix_broadcast_sg_krn; + +template +sycl::event true_divide_contig_matrix_contig_row_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, + // res[i,j] = mat[i,j] / vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< + argT1, argT2, resT, TrueDivideContigMatrixContigRowBroadcastingFunctor, + true_divide_matrix_row_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p, + res_offset, depends); +} + +template +struct TrueDivideContigMatrixContigRowBroadcastFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename TrueDivideOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + true_divide_contig_matrix_contig_row_broadcast_impl; + return fn; + } + } + } +}; + +template +sycl::event true_divide_contig_row_contig_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, + // res[i,j] = mat[i,j] + vec[j] + ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< + argT1, argT2, resT, TrueDivideContigRowContigMatrixBroadcastingFunctor, + true_divide_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, res_p, + res_offset, depends); +}; + +template +struct TrueDivideContigRowContigMatrixBroadcastFactory +{ + fnT get() + { + if constexpr (!TrueDivideOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using resT = typename TrueDivideOutputType::value_type; + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = + true_divide_contig_row_contig_matrix_broadcast_impl; + return fn; + } + } + } +}; + +template +struct TrueDivideInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + if constexpr (tu_ns::is_complex::value) { + if constexpr (tu_ns::is_complex::value) { + using res_rT = typename resT::value_type; + using arg_rT = typename argT::value_type; + + auto res1 = exprm_ns::complex(res); + res1 /= exprm_ns::complex(in); + res = res1; + } + else { + using res_rT = typename resT::value_type; + + auto res1 = exprm_ns::complex(res); + res1 /= in; + res = res1; + } + } + else { + res /= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res /= in; + } +}; + +/* @brief Types supported by in-place divide */ +template +struct TrueDivideInplaceTypePairSupport +{ + + /* value if true a kernel for must be instantiated */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + td_ns::TypePairDefinedEntry>, + 
td_ns::TypePairDefinedEntry, + resTy, + std::complex>, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TrueDivideInplaceTypeMapFactory +{ + /*! @brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + if constexpr (TrueDivideInplaceTypePairSupport::is_defined) { + return td_ns::GetTypeid{}.get(); + } + else { + return td_ns::GetTypeid{}.get(); + } + } +}; + +template +using TrueDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + TrueDivideInplaceFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TrueDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + TrueDivideInplaceFunctor>; + +template +class true_divide_inplace_contig_kernel; + +template +sycl::event true_divide_inplace_contig_impl( + sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends = {}) +{ + using DivHS = + hyperparam_detail::TrueDivideContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = DivHS::vec_sz; + static constexpr std::uint8_t n_vecs = DivHS::n_vecs; + + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, TrueDivideInplaceContigFunctor, + true_divide_inplace_contig_kernel, vec_sz, n_vecs>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct TrueDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (!TrueDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_strided_kernel; + +template +sycl::event true_divide_inplace_strided_impl( + sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, TrueDivideInplaceStridedFunctor, + true_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TrueDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (!TrueDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_strided_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_row_matrix_broadcast_sg_krn; + +template +using TrueDivideInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + TrueDivideInplaceFunctor>; + +template +sycl::event true_divide_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + std::size_t n0, + std::size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, TrueDivideInplaceRowMatrixBroadcastingFunctor, + true_divide_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct TrueDivideInplaceRowMatrixBroadcastFactory +{ + fnT
get() + { + if constexpr (!TrueDivideInplaceTypePairSupport::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + +} // namespace dpctl::tensor::kernels::true_divide diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp new file mode 100644 index 000000000000..6fae9c4f27e5 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -0,0 +1,226 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of TRUNC(x) function. 
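+///
+/// A minimal semantic sketch of the functor defined below (illustrative
+/// values; truncation rounds toward zero and integral inputs are returned
+/// unchanged):
+///
+/// \code{.cpp}
+/// TruncFunctor<float, float> trunc_f{};
+/// assert(trunc_f(2.7f) == 2.0f && trunc_f(-2.7f) == -2.0f);
+/// TruncFunctor<std::int32_t, std::int32_t> trunc_i{};
+/// assert(trunc_i(5) == 5); // integral fast path is the identity
+/// \endcode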
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "vec_size_util.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::trunc +{ + +using dpctl::tensor::ssize_t; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template +struct TruncFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support subgroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (std::is_integral_v) { + return in; + } + else { + return sycl::trunc(in); + } + } +}; + +template +using TruncContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs, + enable_sg_loadstore>; + +template +using TruncStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template +struct TruncOutputType +{ + using value_type = + typename std::disjunction, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +namespace hyperparam_detail +{ + +namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils; + +using vsu_ns::ContigHyperparameterSetDefault; +using vsu_ns::UnaryContigHyperparameterSetEntry; + +template +struct TruncContigHyperparameterSet +{ + using value_type = + typename std::disjunction>; + + constexpr static auto vec_sz = value_type::vec_sz; + constexpr static auto n_vecs = value_type::n_vecs; +}; + +} // end of namespace hyperparam_detail + +template +class trunc_contig_kernel; + +template +sycl::event trunc_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + using TruncHS = hyperparam_detail::TruncContigHyperparameterSet; + static constexpr std::uint8_t vec_sz = TruncHS::vec_sz; + static constexpr std::uint8_t n_vecs = TruncHS::n_vecs; + + return elementwise_common::unary_contig_impl< + argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz, + n_vecs>(exec_q, nelems, arg_p, res_p, depends); +} + +template +struct TruncContigFactory +{ + fnT get() + { + if constexpr (!TruncOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = trunc_contig_impl; + return fn; + } + } +}; + +template +struct TruncTypeMapFactory +{ + /*!
@brief get typeid for output type of sycl::trunc(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename TruncOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class trunc_strided_kernel; + +template +sycl::event + trunc_strided_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *arg_p, + ssize_t arg_offset, + char *res_p, + ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, TruncOutputType, TruncStridedFunctor, trunc_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TruncStridedFactory +{ + fnT get() + { + if constexpr (!TruncOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = trunc_strided_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::trunc diff --git a/dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp new file mode 100644 index 000000000000..bdbc7e50cc86 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp @@ -0,0 +1,70 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +/// +/// \file +/// This file defines utilities for selection of hyperparameters for kernels +/// implementing unary and binary elementwise functions for contiguous inputs +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace dpctl::tensor::kernels::vec_size_utils +{ +template +struct BinaryContigHyperparameterSetEntry + : std::conjunction, std::is_same> +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; + +template +struct UnaryContigHyperparameterSetEntry : std::is_same +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; + +template +struct ContigHyperparameterSetDefault : std::true_type +{ + static constexpr std::uint8_t vec_sz = vec_sz_v; + static constexpr std::uint8_t n_vecs = n_vecs_v; +}; +} // namespace dpctl::tensor::kernels::vec_size_utils diff --git a/dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..f6d2f0175ce8 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -0,0 +1,418 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
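+///
+/// Gather sketch for a single indexed axis (illustrative values; WrapIndex
+/// and ClipIndex are the ProjectorT policies from utils/indexing_utils.hpp
+/// that map out-of-bounds indices into the axis extent):
+///
+/// \code{.cpp}
+/// // src = {10, 20, 30, 40, 50}, axis length 5, ind = {0, -1, 3}
+/// // take with WrapIndex: -1 wraps to 4  -> dst = {10, 50, 40}
+/// // take with ClipIndex: -1 clamps to 0 -> dst = {10, 10, 40}
+/// // put scatters in the opposite direction with the same projection
+/// \endcode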
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/indexing_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::indexing +{ + +using dpctl::tensor::ssize_t; + +template +class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + src_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + dst_offset += axes_strider(i_along); + + dst[dst_offset] = src[src_offset]; + } +}; + +template +class take_kernel; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + take_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + 
cgh.parallel_for( + sycl::range<1>(gws), + TakeFunctor( + src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return take_ev; +} + +template +class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t val_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + dst_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + val_offset += axes_strider(i_along); + + dst[dst_offset] = val[val_offset]; + } +}; + +template +class put_kernel; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + char *, + const char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + put_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return 
put_ev; +} + +template +struct TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace dpctl::tensor::kernels::indexing diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp new file mode 100644 index 000000000000..b987ff2988be --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -0,0 +1,1399 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for the vector dot product. 
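+///
+/// Every variant below evaluates a batched inner product; in index form,
+/// with both inputs converted to the output type before multiplication:
+///
+/// \code{.cpp}
+/// // for every batch b and reduction extent n:
+/// //   out[b] = sum_{i = 0}^{n - 1} convert<outT>(lhs[b, i]) *
+/// //                                convert<outT>(rhs[b, i])
+/// \endcode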
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/reductions.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; +namespace su_ns = dpctl::tensor::sycl_utils; + +template +struct SequentialDotProduct +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialDotProduct(const lhsT *lhs, + const rhsT *rhs, + outT *out, + BatchIndexerT batch_indexer, + RedIndexerT reduced_dims_indexer, + std::size_t reduction_size) + : lhs_(lhs), rhs_(rhs), out_(out), batch_indexer_(batch_indexer), + reduced_dims_indexer_(reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &batch_offsets = batch_indexer_(id[0]); + const ssize_t &lhs_batch_offset = batch_offsets.get_first_offset(); + const ssize_t &rhs_batch_offset = batch_offsets.get_second_offset(); + const ssize_t &out_batch_offset = batch_offsets.get_third_offset(); + + outT red_val(0); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + auto reduction_offsets = reduced_dims_indexer_(m); + auto lhs_reduction_offset = reduction_offsets.get_first_offset(); + auto rhs_reduction_offset = reduction_offsets.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + red_val += convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + } + + out_[out_batch_offset] = red_val; + } +}; + +template +struct DotProductFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), batches_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto 
&out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, outT(0), reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_batch_offset]); + res_ref += red_val_over_wg; + } + } +}; + +template +struct DotProductCustomFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductCustomFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + batches_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto &out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + 
lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_batch_offset]); + res_ref += red_val_over_wg; + } + } +}; + +template class kernel_name_token> +sycl::event sequential_dot_product(sycl::queue &exec_q, + const lhsTy *lhs, + const rhsTy *rhs, + resTy *res, + std::size_t batches, + std::size_t reduction_nelems, + const BatchIndexerT &batch_indexer, + const RedIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for< + kernel_name_token>( + sycl::range<1>(batches), + SequentialDotProduct(lhs, rhs, res, batch_indexer, + reduction_indexer, + reduction_nelems)); + }); + + return dot_ev; +} + +template class kernel_name_token> +sycl::event submit_atomic_dot_product(sycl::queue &exec_q, + const lhsTy *lhs, + const rhsTy *rhs, + resTy *res, + std::size_t wg, + std::size_t batches, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const BatchIndexerT &batch_indexer, + const RedIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{batches * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, DotProductFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, reduction_nelems, batches, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper>; + + cgh.parallel_for( + ndRange, + DotProductCustomFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, local_memory, reduction_nelems, batches, + reductions_per_wi)); + } + }); + return dot_ev; +} + +template +class dot_product_seq_krn; + +template +class dot_product_init_krn; + +template +class dot_product_krn; + +typedef sycl::event (*dot_product_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event dot_product_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + int batch_nd, + const ssize_t *batch_shape_and_strides, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputBatchIndexerT = + 
dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const InputOutputBatchIndexerT inp_out_batch_indexer{ + batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, depends); + + return dot_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = batch_shape_and_strides; + const ssize_t *const &res_strides = + batch_shape_and_strides + 3 * batch_nd; + const IndexerT res_indexer(batch_nd, batch_res_offset, res_shape, + res_strides); + using InitKernelName = + class dot_product_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(batches), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = 0; + }); + }); + + using ReductionOpT = sycl::plus; + + using BatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset, + batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + static constexpr std::size_t preferred_reductions_per_wi = + 4; // determined experimentally + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event dot_ev = + submit_atomic_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems, + reductions_per_wi, reduction_groups, batch_indexer, + reduction_indexer, {res_init_ev}); + + return dot_ev; + } +} + +typedef sycl::event (*dot_product_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event + dot_product_contig_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp) + + batch_lhs_offset + reduction_lhs_offset; + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp) + + batch_rhs_offset + reduction_rhs_offset; + resTy *res_tp = reinterpret_cast(res_cp) + batch_res_offset; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputBatchIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer< + InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; + const InputOutputBatchIndexerT inp_out_batch_indexer{ + inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, depends); + + return dot_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.fill(res_tp, resTy(0), batches); + }); + + using ReductionOpT = sycl::plus; + + using InputBatchIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer< + InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; + const InputOutputBatchIndexerT inp_out_batch_indexer{ + inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + + static constexpr std::size_t preferred_reductions_per_wi = + 4; // determined experimentally + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event dot_ev = + submit_atomic_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems, + reductions_per_wi, reduction_groups, inp_out_batch_indexer, + reduction_indexer, {res_init_ev}); + + return dot_ev; + } +} + +template +struct DotProductNoAtomicFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductNoAtomicFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), batches_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t n_reduction_groups = it.get_group_range(0) / batches_; + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto &out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + + using RedOpT = typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, outT(0), RedOpT()); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_batch_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template +struct DotProductNoAtomicCustomFunctor +{ +private: + const lhsT *lhs_ = nullptr; + const rhsT *rhs_ = nullptr; + outT *out_ = 
nullptr; + ReductionOpT reduction_op_; + BatchIndexerT batch_indexer_; + RedIndexerT reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t batches_ = 1; + std::size_t reductions_per_wi = 16; + +public: + DotProductNoAtomicCustomFunctor(const lhsT *lhs, + const rhsT *rhs, + outT *res, + const ReductionOpT &reduction_op, + const BatchIndexerT &batch_indexer, + const RedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op), + batch_indexer_(batch_indexer), + reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + batches_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t batch_id = it.get_group(0) % batches_; + const std::size_t reduction_batch_id = it.get_group(0) / batches_; + const std::size_t n_reduction_groups = it.get_group_range(0) / batches_; + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + // for each input + + const auto &batch_offsets_ = batch_indexer_(batch_id); + const auto &lhs_batch_offset = batch_offsets_.get_first_offset(); + const auto &rhs_batch_offset = batch_offsets_.get_second_offset(); + const auto &out_batch_offset = batch_offsets_.get_third_offset(); + + outT local_red_val(0); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid); + const auto &lhs_reduction_offset = + reduction_offsets_.get_first_offset(); + const auto &rhs_reduction_offset = + reduction_offsets_.get_second_offset(); + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl( + lhs_[lhs_batch_offset + lhs_reduction_offset]) * + convert_impl( + rhs_[rhs_batch_offset + rhs_reduction_offset]); + + local_red_val += val; + } + + auto work_group = it.get_group(); + + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_batch_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template class kernel_name_token> +sycl::event + submit_no_atomic_dot_product(sycl::queue &exec_q, + const lhsTy *lhs, + const rhsTy *rhs, + resTy *res, + std::size_t wg, + std::size_t batches, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const BatchIndexerT &batch_indexer, + const RedIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{batches * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr 
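+        // can_use_reduce_over_group gates this branch on result types for
+        // which sycl::reduce_over_group and its known identities are
+        // defined; otherwise the custom functor reduces through a
+        // sycl::local_accessor scratchpad instead. A minimal sketch of the
+        // built-in path (names illustrative):
+        //
+        //   outT wg_sum = sycl::reduce_over_group(
+        //       it.get_group(), local_red_val, outT(0), sycl::plus<outT>());
+        //   if (it.get_group().leader())
+        //       out[group_slot] = wg_sum; // one slot per work-group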
(can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + DotProductNoAtomicFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, reduction_nelems, batches, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper>; + + cgh.parallel_for( + ndRange, + DotProductNoAtomicCustomFunctor( + lhs, rhs, res, ReductionOpT(), batch_indexer, + reduction_indexer, local_memory, reduction_nelems, batches, + reductions_per_wi)); + } + }); + return dot_ev; +} + +template +class dot_product_tree_krn; + +template +class dot_product_tree_reduction_krn; + +template +sycl::event dot_product_tree_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + int batch_nd, + const ssize_t *batch_shape_and_strides, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const InputOutputBatchIndexerT inp_out_batch_indexer{ + batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, depends); + + return dot_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + using ReductionOpT = typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + using BatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + + const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset, + batch_rhs_offset, batch_res_offset, + batch_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset, + reduction_rhs_offset, + reduction_shape_stride}; + + if (batches == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event dot_ev = + submit_no_atomic_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems, + reductions_per_wi, reduction_groups, batch_indexer, + reduction_indexer, depends); + + return dot_ev; + } 
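+    // Worked example for the single-pass branch above: with
+    // reduction_nelems = 10000 and wg = max_wg = 1024 (illustrative values),
+    //   reductions_per_wi = ceil(10000 / 1024) = 10,
+    //   reduction_groups = ceil(10000 / (10 * 1024)) = 1,
+    // so a single work-group per batch writes res directly and no temporary
+    // or second reduction pass is needed.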
+    else {
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        // more than one work-group is needed, which requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        // returns a unique_ptr owning the temporary USM allocation
+        auto partially_reduced_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                batches * (reduction_groups + second_iter_reduction_groups_),
+                exec_q);
+
+        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * batches;
+
+        sycl::event first_reduction_ev;
+        {
+            using LhsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+            using RhsIndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputBatchIndexerT =
+                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                    LhsIndexerT, RhsIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+            const LhsIndexerT lhs_indexer(batch_nd, batch_lhs_offset,
+                                          batch_shape_and_strides);
+            const RhsIndexerT rhs_indexer(
+                batch_nd, batch_rhs_offset, batch_shape_and_strides,
+                batch_shape_and_strides + 2 * batch_nd);
+            static constexpr ResIndexerT noop_tmp_indexer{};
+
+            const InputOutputBatchIndexerT in_out_iter_indexer{
+                lhs_indexer, rhs_indexer, noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                red_nd, reduction_lhs_offset, reduction_rhs_offset,
+                reduction_shape_stride};
+
+            first_reduction_ev = submit_no_atomic_dot_product<
+                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+                ReductionIndexerT, dot_product_tree_krn>(
+                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches,
+                reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems +
+                 preferred_reductions_per_wi * wg - 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            using InputIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ batches,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{
+                inp_indexer, res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev =
+                dpctl::tensor::kernels::submit_no_atomic_reduction<
+                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, dot_product_tree_reduction_krn>(
+                    exec_q, temp_arg, temp2_arg, identity_val, wg, batches,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_,
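+                    // Sizing of the temporary above, worked through with
+                    // illustrative numbers: reduction_nelems = 1'000'000,
+                    // wg = max_wg = 256, preferred_reductions_per_wi = 8:
+                    //   reduction_groups = ceil(1e6 / 2048) = 489,
+                    //   second_iter_reduction_groups_ = ceil(489 / 2048) = 1,
+                    // so batches * (489 + 1) elements are allocated: the
+                    // first pass fills 489 per batch and later passes
+                    // ping-pong into the remaining slots.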
in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + batch_nd, batch_res_offset, + /* shape */ batch_shape_and_strides, + /* strides */ batch_shape_and_strides + 2 * batch_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, dot_product_tree_reduction_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, batches, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + // transfer ownership of USM allocation to host_task + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, partially_reduced_tmp_owner); + + return cleanup_host_task_event; + } +} + +template +sycl::event + dot_product_contig_tree_impl(sycl::queue &exec_q, + std::size_t batches, + std::size_t reduction_nelems, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, + const std::vector &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp) + + batch_lhs_offset + reduction_lhs_offset; + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp) + + batch_rhs_offset + reduction_rhs_offset; + resTy *res_tp = reinterpret_cast(res_cp) + batch_res_offset; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputBatchIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputBatchIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer< + InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + + const InputBatchIndexerT inp_batch_indexer{/* size */ batches, + /* step */ reduction_nelems}; + const InputOutputBatchIndexerT inp_out_batch_indexer{ + inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + + sycl::event dot_ev = + sequential_dot_product( + exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems, + inp_out_batch_indexer, reduction_indexer, 
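+        // In the contiguous case the batch indexer is pure arithmetic: with
+        // batches = 3 and reduction_nelems = 5 (illustrative), batch b reads
+        // lhs and rhs starting at offset b * 5 (i.e. 0, 5, 10) and writes a
+        // single res element at offset b, which is exactly what the
+        // Strided1DIndexer/NoOpIndexer combination above encodes.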
depends);
+
+        return dot_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    using ReductionOpT =
+        typename std::conditional<std::is_same_v<resTy, bool>,
+                                  sycl::logical_or<resTy>,
+                                  sycl::plus<resTy>>::type;
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        using InputBatchIndexerT =
+            dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputBatchIndexerT =
+            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+
+        const InputBatchIndexerT inp_batch_indexer{
+            /* size */ batches, /* step */ reduction_nelems};
+        const InputOutputBatchIndexerT inp_out_batch_indexer{
+            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
+                                                             NoOpIndexerT{}};
+
+        if (batches == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event dot_ev = submit_no_atomic_dot_product<
+            lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+            ReductionIndexerT, dot_product_tree_krn>(
+            exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
+            reductions_per_wi, reduction_groups, inp_out_batch_indexer,
+            reduction_indexer, depends);
+
+        return dot_ev;
+    }
+    else {
+        static constexpr resTy identity_val =
+            sycl::known_identity<ReductionOpT, resTy>::value;
+
+        // more than one work-group is needed, which requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        // unique_ptr that owns temporary allocation for partial reductions
+        auto partially_reduced_tmp_owner =
+            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+                batches * (reduction_groups + second_iter_reduction_groups_),
+                exec_q);
+        // get raw pointers
+        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * batches;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputBatchIndexerT =
+                dpctl::tensor::offset_utils::Strided1DIndexer;
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputBatchIndexerT =
+                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
+                    InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+
+            const InputBatchIndexerT inp_batch_indexer{
+                /* size */ batches,
+                /* step */ reduction_nelems};
+            const InputOutputBatchIndexerT inp_out_batch_indexer{
+                inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
+            static constexpr ReductionIndexerT reduction_indexer{
+                NoOpIndexerT{}, NoOpIndexerT{}};
+
+            first_reduction_ev = submit_no_atomic_dot_product<
+                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
+                ReductionIndexerT,
dot_product_tree_krn>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches, + reduction_nelems, preferred_reductions_per_wi, reduction_groups, + inp_out_batch_indexer, reduction_indexer, depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, dot_product_tree_reduction_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, batches, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ batches, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, dot_product_tree_reduction_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, batches, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, partially_reduced_tmp_owner); + + return cleanup_host_task_event; + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp new file mode 100644 index 000000000000..5644ea172a1d --- 
/dev/null +++ b/dpnp/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -0,0 +1,4233 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for general matrix multiplication (GEMM). 
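+///
+/// For one batch entry with row-major, C-contiguous operands the kernels
+/// compute
+///   res[i * m + j] = sum over s in [0, k) of lhs[i * k + s] * rhs[s * m + j]
+/// for 0 <= i < n and 0 <= j < m; strided layouts are handled through the
+/// indexer objects passed to each kernel.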
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/reductions.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +namespace gemm_detail +{ + +template +void scale_gemm_k_parameters(const std::size_t &local_mem_size, + const std::size_t &reserved_slm_size, + const std::size_t delta_k, + std::size_t &n_wi, + std::size_t &delta_n) +{ + static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups; + + while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >= + local_mem_size) { + n_wi = n_wi / 2; + delta_n = delta_n / 2; + if (delta_n == 0) + throw std::runtime_error("Insufficient resources"); + } +} + +template +void scale_gemm_nm_parameters(const std::size_t &local_mem_size, + const std::size_t &reserved_slm_size, + const std::size_t &wi_delta_n, + std::size_t &wi_delta_k, + std::size_t &wg_delta_n, + std::size_t &wg_delta_m) +{ + static constexpr std::size_t slm_A_elem_size = sizeof(T); + static constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m; + + while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) + + (wi_delta_k * wg_delta_m * slm_B_elem_size) + + reserved_slm_size >= + local_mem_size) { + wg_delta_n /= 2; + wg_delta_m /= 2; + wi_delta_k /= 2; + if (wg_delta_n == 0) + throw std::runtime_error("Insufficient resources"); + } +} +} // namespace gemm_detail + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +class gemm_seq_reduction_krn; + +template +class gemm_tree_reduction_krn; + +template +sycl::event single_reduction_for_gemm(sycl::queue &exec_q, + T *tmp_tp, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + int res_nd, + ssize_t res_offset, + const ssize_t *res_shapes_strides, + const std::vector &depends) +{ + sycl::event red_ev; + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides}; + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + res_iter_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for>( + iter_range, + SequentialReduction( + tmp_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + } + else { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, ResIndexerT>; + using ReductionIndexerT = 
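+        // As in the branch above, the temporary is a C-contiguous
+        // (reduction_nelems, iter_nelems) matrix reduced along axis 0:
+        // partial sum r of output element p lives at
+        // tmp[r * iter_nelems + p], so Strided1DIndexer{reduction_nelems,
+        // iter_nelems} yields offsets 0, iter_nelems, 2 * iter_nelems, ...
+        // for a fixed p.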
dpctl::tensor::offset_utils::Strided1DIndexer; + + const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides}; + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + res_iter_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + } + return red_ev; +} + +template +sycl::event + single_reduction_for_gemm_contig(sycl::queue &exec_q, + T *tmp_tp, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + const std::vector &depends) +{ + sycl::event red_ev; + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + // tmp allocation is a C-contiguous matrix (reduction_nelems, + // iter_nelems) and we are reducing by axis 0 + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for>( + iter_range, + SequentialReduction( + tmp_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + } + else { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + // tmp allocation is a C-contiguous matrix + // (reduction_nelems, iter_nelems). 
Reducing along axis 0 + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + } + return red_ev; +} + +template +sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, + T *partially_reduced_tmp, + T *partially_reduced_tmp2, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + int res_nd, + ssize_t res_offset, + const ssize_t *res_shape_strides, + const std::vector &depends) +{ + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + // partially_reduced_tmp is C-contig matrix with shape + // (reduction_nelems, iter_nelems). Reducing along axis 0. + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val, + wg, iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + T *temp_arg = partially_reduced_tmp2; + T *temp2_arg = partially_reduced_tmp; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, temp_arg, temp2_arg, identity_val, 
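+                // Shape of this tree phase, as a simplified sketch
+                // (ceil_div and the launch helpers are hypothetical):
+                //
+                //   T *src = partially_reduced_tmp2, *dst = partially_reduced_tmp;
+                //   while (nelems > preferred_reductions_per_wi * max_wg) {
+                //       std::size_t groups =
+                //           ceil_div(nelems, preferred_reductions_per_wi * wg);
+                //       launch_partial_reduction(src, dst, nelems, groups);
+                //       nelems = groups;
+                //       std::swap(src, dst);
+                //   }
+                //   // one final kernel folds the surviving nelems into res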
wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + /* ndim */ res_nd, + /* offset */ static_cast(res_offset), + /* packed shape_strides*/ res_shape_strides}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_tree_reduction_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + return final_reduction_ev; +} + +template +class gemm_reduction_over_group_temps_contig_krn; + +template +sycl::event + tree_reduction_for_gemm_contig(sycl::queue &exec_q, + T *partially_reduced_tmp, + T *partially_reduced_tmp2, + T *res_tp, + T identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reduction_groups, + std::size_t wg, + std::size_t max_wg, + std::size_t preferred_reductions_per_wi, + std::size_t reductions_per_wi, + const std::vector &depends) +{ + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + static constexpr InputOutputIterIndexerT in_out_iter_indexer{ + NoOpIndexerT{}, NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + const sycl::event &first_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_reduction_over_group_temps_contig_krn>( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val, + wg, iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, depends); + + std::size_t remaining_reduction_nelems = reduction_groups; + + T *temp_arg = partially_reduced_tmp2; + T *temp2_arg = partially_reduced_tmp; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = 
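+            // After the first pass the layout flips: each pass stores the
+            // partial sums of output element p contiguously at
+            // dst[p * reduction_groups_ + r], so later passes reduce over r
+            // with a NoOpIndexer and step through p with a Strided1DIndexer,
+            // as set up below.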
dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + // n * m = iter_nelems because essentially, this process + // creates a stack of reduction_nelems 2D matrices and we reduce + // along the stack axis + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_reduction_over_group_temps_contig_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + { + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{ + /* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + dpctl::tensor::kernels::submit_no_atomic_reduction< + T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, + gemm_reduction_over_group_temps_contig_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + return final_reduction_ev; + } +} + +template +class GemmBatchFunctorThreadK +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT workspace; + LocAccT local_B_block; + std::size_t n = 0; + std::size_t n_blocks = 0; + std::size_t delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t delta_k = 0; + std::size_t n_wi = 0; + std::size_t m = 0; + std::size_t batch_nelems = 0; + BatchDimsIndexerT batch_indexer; + OuterInnerDimsIndexerT lhs_indexer; + OuterInnerDimsIndexerT rhs_indexer; + OuterInnerDimsIndexerT res_indexer; + +public: + GemmBatchFunctorThreadK(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT workspace_, + LocAccT local_B_block_, + std::size_t n_, + std::size_t n_blocks_, + std::size_t delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t delta_k_, + std::size_t n_wi_, + std::size_t m_, + std::size_t batch_nelems_, + const BatchDimsIndexerT &batch_indexer_, + const OuterInnerDimsIndexerT &lhs_indexer_, 
+ const OuterInnerDimsIndexerT &rhs_indexer_, + const OuterInnerDimsIndexerT &res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_), + local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_), + delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_), + n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_), + batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_), + rhs_indexer(rhs_indexer_), res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + // for batching: + // (current matrix in batch) m_id = global_id / (global_range / + // batch_nelems) for lhs, offset = m_id * (n * k) for rhs, offset = + // m_id + // * (k * m) for res, offset = m_id * (n * m) + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - m_id * n_groups_per_batch; + const std::size_t lid = it.get_local_linear_id(); + + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); + + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + // lift gr_id -> (block_i, block_j, block_s) + // block_i moves fastest, then block_s, then block_j + + const std::size_t r_size = (n_blocks * k_blocks); + // 0 <= block_j < m_blocks, + const std::size_t block_j = gr_id / r_size; + // 0 <= block_r < n_blocks * k_blocks + const std::size_t block_r = gr_id - block_j * r_size; + // 0 <= block_s < k_blocks + const std::size_t block_s = block_r / n_blocks; + // 0 <= block_i < n_blocks + const std::size_t block_i = block_r - block_s * n_blocks; + + // 0 <= local_i < delta_n + const std::size_t local_i = lid / (delta_k); + // 0 <= local_s < delta_k + const std::size_t local_s = lid - local_i * (delta_k); + + std::size_t i = block_i * delta_n + local_i; + std::size_t j = m_groups * block_j; + std::size_t s = block_s * delta_k * n_wi + local_s; + + using accV_t = typename LocAccT::value_type; + + static constexpr resT identity_ = resT(0); + if (local_i == 0) { + for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { + const std::size_t sq = s + q; + const std::size_t sqmj = sq * m + j; + + if constexpr (m_groups == 1 && std::is_same_v) { + local_B_block[local_s + q] = + (sq < k && j < m) + ? static_cast( + rhs[rhs_offset + rhs_indexer(sqmj)]) + : identity_; + } + else { + accV_t local_B_vec; +#pragma unroll + for (std::size_t vec_idx = 0; vec_idx < m_groups; + ++vec_idx) { + local_B_vec[vec_idx] = + (sq < k && j + vec_idx < m) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(sqmj + vec_idx)]) + : identity_; + } + local_B_block[local_s + q] = local_B_vec; + } + } + } + + it.barrier(sycl::access::fence_space::local_space); + + std::size_t t_shift = block_s * delta_k * n_wi; + std::size_t global_s_offset = i * k + t_shift; + + accV_t private_sum(identity_); + static constexpr accV_t vec_identity_(identity_); + for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { + private_sum += + ((i < n) && (t + t_shift < k)) + ? 
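+              // Worked example of the gr_id lifting above with n_blocks = 4
+              // and k_blocks = 3 (illustrative), so r_size = 12: gr_id = 30
+              // gives block_j = 30 / 12 = 2, block_r = 30 - 24 = 6, then
+              // block_s = 6 / 4 = 1 and block_i = 6 - 4 * 1 = 2.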
(static_cast( + lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) * + local_B_block[t]) + : vec_identity_; + } + + std::size_t workspace_i_shift = local_i * delta_k; + workspace[workspace_i_shift + local_s] = private_sum; + + it.barrier(sycl::access::fence_space::local_space); + + if (local_s == 0 && i < n) { + accV_t local_sum(workspace[workspace_i_shift]); + for (std::size_t t = 1; t < delta_k; ++t) { + local_sum += workspace[workspace_i_shift + t]; + } + + sycl::atomic_ref + aout0(res[res_offset + res_indexer(i * m + j)]); + + if constexpr (m_groups == 1 && std::is_same_v) { + aout0 += local_sum; + } + else { + aout0 += local_sum[0]; + +#pragma unroll + for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) { + if (j + vec_id < m) { + sycl::atomic_ref< + resT, sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::global_space> + aout1(res[res_offset + + res_indexer(i * m + j + vec_id)]); + + aout1 += local_sum[vec_id]; + } + } + } + } + } +}; + +template +class gemm_init_krn; + +template +class gemm_k_krn; + +template +class gemm_nm_krn; + +template +class gemm_batch_k_krn; + +template +class gemm_batch_nm_krn; + +namespace gemm_detail +{ + +template +sycl::event _gemm_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static constexpr std::size_t m_groups = 4; + const std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + std::size_t n_blocks = (n + delta_n - 1) / delta_n; + std::size_t m_blocks = (m + m_groups - 1) / m_groups; + std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); + + std::size_t lws = delta_n * delta_k; + + auto gRange = + sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws); + auto lRange = sycl::range<1>(lws); + + auto ndRange = sycl::nd_range<1>(gRange, lRange); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using LocAccT = sycl::local_accessor, 1>; + LocAccT local_B_block(n_wi * delta_k, cgh); + LocAccT workspace(delta_n * delta_k, cgh); + + using KernelName = + class gemm_batch_k_krn; + cgh.parallel_for( + ndRange, + GemmBatchFunctorThreadK( + lhs_tp, rhs_tp, res_tp, std::move(workspace), + std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks, + delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +template +sycl::event _gemm_small_m_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static constexpr std::size_t m_groups = 1; + const std::size_t 
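+    // Launch geometry of the thread-K kernels, worked through with the
+    // defaults above (delta_n = 32, delta_k = 4, n_wi = 64, so lws = 128)
+    // and illustrative sizes n = 1000, k = 5000, m = 10, m_groups = 4:
+    //   n_blocks = ceil(1000 / 32) = 32, m_blocks = ceil(10 / 4) = 3,
+    //   k_blocks = ceil(5000 / 256) = 20,
+    // giving batch_nelems * 32 * 3 * 20 work-groups; the k_blocks partial
+    // products are combined through the atomic_ref accumulation into res.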
delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + std::size_t n_blocks = (n + delta_n - 1) / delta_n; + std::size_t m_blocks = (m + m_groups - 1) / m_groups; + std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); + + std::size_t lws = delta_n * delta_k; + + auto gRange = + sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws); + auto lRange = sycl::range<1>(lws); + + auto ndRange = sycl::nd_range<1>(gRange, lRange); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using LocAccT = sycl::local_accessor; + LocAccT local_B_block(n_wi * delta_k, cgh); + LocAccT workspace(delta_n * delta_k, cgh); + + using KernelName = + class gemm_batch_k_krn; + cgh.parallel_for( + ndRange, + GemmBatchFunctorThreadK( + lhs_tp, rhs_tp, res_tp, std::move(workspace), + std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks, + delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer)); + }); + + return gemm_ev; +} + +} // end of namespace gemm_detail + +template +class GemmBatchFunctorThreadNM_vecm +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT1 local_lhs_block; + LocAccT2 local_rhs_block; + std::size_t batch_nelems; + std::size_t n = 0; + std::size_t k = 0; + std::size_t m = 0; + std::size_t n_groups = 0; + std::uint32_t wg_delta_n = 0; + std::uint32_t wg_delta_m = 0; + std::uint32_t wi_delta_k = 0; + BatchDimsIndexerT batch_indexer; + LhsIndexerT lhs_indexer; + RhsIndexerT rhs_indexer; + ResIndexerT res_indexer; + +public: + /*! 
@brief */ + GemmBatchFunctorThreadNM_vecm(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT1 local_lhs_block_, + LocAccT2 local_rhs_block_, + std::size_t batch_nelems_, + std::size_t n_, + std::size_t k_, + std::size_t m_, + std::size_t n_groups_, + std::size_t wg_delta_n_, + std::size_t wg_delta_m_, + std::size_t wi_delta_k_, + const BatchDimsIndexerT &batch_indexer_, + const LhsIndexerT &lhs_indexer_, + const RhsIndexerT &rhs_indexer_, + const ResIndexerT &res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), local_lhs_block(local_lhs_block_), + local_rhs_block(local_rhs_block_), batch_nelems(batch_nelems_), n(n_), + k(k_), m(m_), n_groups(n_groups_), wg_delta_n(wg_delta_n_), + wg_delta_m(wg_delta_m_), wi_delta_k(wi_delta_k_), + batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_), + rhs_indexer(rhs_indexer_), res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + static constexpr resT zero_(0); + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; + + const std::size_t gws_per_batch = it.get_group_range(0) / batch_nelems; + const std::size_t batch_id = it.get_group_linear_id() / gws_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - batch_id * gws_per_batch; + + const auto &three_offsets_ = + batch_indexer(static_cast(batch_id)); + + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + // 0 <= block_j < m_groups + const std::size_t block_j = gr_id / n_groups; + // 0 <= block_i < n_groups + const std::size_t block_i = gr_id - block_j * n_groups; + + // Assumption: lws == wg_delta_n * wg_delta_m + const std::uint32_t lid = it.get_local_linear_id(); + // 0 <= local_j < (lws / wg_delta_n == wg_delta_m) + const std::uint32_t local_j = lid / wg_delta_n; + // sub-group lanes map to adjacent local_i + const std::uint32_t local_i = lid - local_j * wg_delta_n; + + // Coordinates of the block of C the work-group works on + std::size_t i = block_i * wg_delta_n * wi_delta_n; + std::size_t j = block_j * wg_delta_m * wi_total_delta_m; + + using slmA_t = typename LocAccT1::value_type; + using slmB_t = typename LocAccT2::value_type; + + const std::size_t a_st0 = k; + const std::size_t a_st1 = 1; + + const std::size_t b_st0 = m; + const std::size_t b_st1 = 1; + + const std::size_t c_st0 = m; + const std::size_t c_st1 = 1; + + // allocate/initialize private matrix C + // size ( wi_total_delta_n, wi_total_delta_m ) + static constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs; + std::array private_C{slmB_t{zero_}}; + + for (std::size_t s = 0; s < k; s += wi_delta_k) { + // populate local_lhs_block ( wg_delta_n * wi_delta_n, + // wi_delta_k) + for (std::uint32_t vid = lid; vid < local_lhs_block.size(); + vid += it.get_local_range()[0]) { + // 0 <= v_i < wg_delta_n * wi_delta_n + const std::uint32_t v_i = vid / wi_delta_k; + // 0 <= v_s < wi_delta_k + const std::uint32_t v_s = vid - v_i * wi_delta_k; + + const std::size_t g_i = i + v_i; + const std::size_t g_s = s + v_s; + + const std::uint32_t mapped_vid = + wg_delta_n * wi_delta_n * v_s + v_i; + local_lhs_block[mapped_vid] = + (g_i < n && g_s < k) + ? 
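+            // mapped_vid above stores the lhs tile k-major: element
+            // (v_i, v_s) lands at v_s * (wg_delta_n * wi_delta_n) + v_i, so
+            // in the multiply-accumulate loop work-items with consecutive
+            // local_i read adjacent SLM entries for each pr_k.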
static_cast( + lhs[lhs_offset + + lhs_indexer(g_i * a_st0 + g_s * a_st1)]) + : zero_; + } + + // populate local_rhs_block> ( wg_delta_m * + // wi_delta_m_vecs, wi_delta_k ) + for (std::uint32_t vid = lid; vid < local_rhs_block.size(); + vid += it.get_local_range()[0]) { + // 0 <= v_j < wg_delta_m * wi_delta_m_vecs + const std::uint32_t v_j = vid / wi_delta_k; + // 0 <= v_s < wi_delta_k + const std::uint32_t v_s = vid - v_j * wi_delta_k; + + const std::size_t g_j = j + v_j * m_vec_size; + const std::size_t g_s = s + v_s; + const std::uint32_t mapped_vid = + wg_delta_m * wi_delta_m_vecs * v_s + v_j; + + if constexpr (m_vec_size == 1) { + local_rhs_block[mapped_vid] = + (g_j < m && g_s < k) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + g_j * b_st1)]) + : zero_; + } + else { + slmB_t vec{}; +#pragma unroll + for (std::uint32_t lane_id = 0; lane_id < m_vec_size; + ++lane_id) { + const std::size_t g_j1 = g_j + lane_id; + vec[lane_id] = (g_j1 < m && g_s < k) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + + g_j1 * b_st1)]) + : zero_; + }; + + local_rhs_block[mapped_vid] = vec; + } + } + + it.barrier(sycl::access::fence_space::local_space); + + const std::uint32_t lo_lhs_st_k = (wg_delta_n * wi_delta_n); + const std::uint32_t lo_rhs_rk_k = (wg_delta_m * wi_delta_m_vecs); + for (std::uint32_t pr_k = 0; pr_k < wi_delta_k; ++pr_k) { + std::array pr_lhs{}; +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { + pr_lhs[pr_i] = + local_lhs_block[pr_k * lo_lhs_st_k + + (local_i + pr_i * wg_delta_n)]; + } + + std::array pr_rhs{}; +#pragma unroll + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j) { + pr_rhs[pr_j] = + local_rhs_block[pr_k * lo_rhs_rk_k + + (local_j + pr_j * wg_delta_m)]; + } + +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { +#pragma unroll + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; + ++pr_j) { + private_C[pr_i * wi_delta_m_vecs + pr_j] += + pr_lhs[pr_i] * pr_rhs[pr_j]; + } + } + } + + it.barrier(sycl::access::fence_space::local_space); + } + + if constexpr (m_vec_size == 1) { +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { + std::size_t out_i = i + local_i + pr_i * wg_delta_n; + if (out_i < n) { +#pragma unroll + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; + ++pr_j) { + const std::size_t out_j = + j + (local_j + pr_j * wg_delta_m) * m_vec_size; + const std::size_t out_flat_id = + out_i * c_st0 + out_j * c_st1; + if (out_j < m) { + res[res_offset + res_indexer(out_flat_id)] = + private_C[pr_i * wi_delta_m_vecs + pr_j]; + } + } + } + } + } + else { +#pragma unroll + for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) { + std::size_t out_i = i + local_i + pr_i * wg_delta_n; + if (out_i < n) { + // could be unrolled + for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; + ++pr_j) { + std::size_t out_j = + j + (local_j + pr_j * wg_delta_m) * m_vec_size; +#pragma unroll + for (std::uint32_t lane_id = 0; lane_id < m_vec_size; + ++lane_id) { + const std::size_t out_flat_id = + out_i * c_st0 + (out_j + lane_id) * c_st1; + if (out_j + lane_id < m) { + res[res_offset + res_indexer(out_flat_id)] = + private_C[pr_i * wi_delta_m_vecs + pr_j] + [lane_id]; + } + } + } + } + } + } + } +}; + +struct GemmBatchFunctorThreadNM_vecm_HyperParameters +{ +private: + std::uint32_t wi_delta_n = 2; + std::uint32_t wi_delta_m_vecs = 4; + std::uint32_t m_vec_size = 1; + +public: + constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters(); + constexpr 
GemmBatchFunctorThreadNM_vecm_HyperParameters( + std::uint32_t wi_delta_n_, + std::uint32_t wi_delta_m_vecs_, + std::uint32_t m_vec_size_) + : wi_delta_n(wi_delta_n_), wi_delta_m_vecs(wi_delta_m_vecs_), + m_vec_size(m_vec_size_) + { + } + + constexpr std::uint32_t get_wi_delta_n() const { return wi_delta_n; } + constexpr std::uint32_t get_wi_delta_m_vecs() const + { + return wi_delta_m_vecs; + } + constexpr std::uint32_t get_m_vec_size() const { return m_vec_size; } +}; + +template +struct GemmBatchFunctorThreadNM_vecm_HyperParametersSelector +{ + constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector() {} + + constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters get() const + { + if constexpr (sizeof(resT) == 1) { + // 1 * 8 * 2 * 4 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(8, 2, 4); + } + else if constexpr (sizeof(resT) == 2) { + // 2 * 4 * 2 * 4 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 2, 4); + } + else if constexpr (sizeof(resT) == 4) { + // 4 * 4 * 1 * 4 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 1, 4); + } + else if constexpr (sizeof(resT) == 8) { + // 8 * 2 * 1 * 4 == 64 + if constexpr (std::is_same_v>) { + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 4, 1); + } + else { + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 1, 4); + } + } + else if constexpr (std::is_same_v>) { + // 16 * 2 * 2 * 1 == 64 + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1); + } + else { + return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1); + } + } +}; + +template +class gemm_batch_nm_vecm_krn; + +namespace gemm_detail +{ + +template +std::tuple + get_wg_delta_m_and_wi_delta_k(const std::size_t slm_byte_size, + const std::uint32_t wg_delta_n, + const std::uint32_t suggested_wg_delta_m) +{ + std::uint32_t wg_delta_m = suggested_wg_delta_m; + + const std::size_t slm_max_rows = + slm_byte_size / + ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T)); + + std::uint32_t wi_delta_k = + (slm_max_rows >= 64) + ? 64 + : 32 * static_cast(slm_max_rows / 32); + + for (std::uint32_t it = 0; !wi_delta_k && (it < 4); ++it) { + wg_delta_m /= 2; + + const std::size_t slm_max_rows = + slm_byte_size / + ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T)); + + wi_delta_k = + (slm_max_rows >= 64) + ? 64 + : ((slm_max_rows >= 32) + ? 32 + : (slm_max_rows >= 16 ? 
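+        // Illustrative budget check: with 64 KiB of SLM, sizeof(T) = 4,
+        // wg_delta_n * wi_delta_n = 128 and wg_delta_m * wi_delta_m = 64,
+        //   slm_max_rows = 65536 / ((128 + 64) * 4) = 85,
+        // so wi_delta_k = 64 on the first try; this retry loop halves
+        // wg_delta_m only while the quantized row count collapses to zero,
+        // and "Insufficient resources" is thrown after four failed attempts.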
16 + : 8 * static_cast( + slm_max_rows / 8))); + } + + if (!wi_delta_k) { + throw std::runtime_error("Insufficient resources"); + } + + return std::make_tuple(wg_delta_m, wi_delta_k); +} + +template +sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + std::vector const &depends) +{ + static constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector< + resTy> + selector{}; + static constexpr auto hyper_params = selector.get(); + + static constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n(); + static constexpr std::uint32_t wi_delta_m_vecs = + hyper_params.get_wi_delta_m_vecs(); + static constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size(); + + static constexpr std::uint32_t wi_total_delta_m = + wi_delta_m_vecs * m_vec_size; + + using KernelName = + class gemm_batch_nm_vecm_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t k_wg_sz = krn.template get_info< + sycl::info::kernel_device_specific::work_group_size>(dev); + + // Limit work-group size + static constexpr std::size_t wg_sz_limit(2048); + const std::size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz); + + const std::uint32_t max_subgroups_per_wg = + static_cast(max_wg_sz / max_sg_size); + + const std::size_t reserved_slm_byte_size = 512; + const std::size_t slm_byte_size = + dev.get_info(); + + const std::uint32_t wg_delta_n = max_sg_size; + std::uint32_t wg_delta_m = 0; + std::uint32_t wi_delta_k = 0; + + std::tie(wg_delta_m, wi_delta_k) = + get_wg_delta_m_and_wi_delta_k( + slm_byte_size - reserved_slm_byte_size, wg_delta_n, + max_subgroups_per_wg); + + const std::uint32_t lws = wg_delta_n * wg_delta_m; + + const std::size_t n_groups = + (n + wg_delta_n * wi_delta_n - 1) / (wg_delta_n * wi_delta_n); + const std::size_t m_groups = (m + wg_delta_m * wi_total_delta_m - 1) / + (wg_delta_m * wi_total_delta_m); + + const std::size_t gws = lws * batch_nelems * n_groups * m_groups; + + sycl::range<1> lRange(lws); + sycl::range<1> gRange(gws); + sycl::nd_range<1> ndRange(gRange, lRange); + + using slmB_t = + typename std::conditional>::type; + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + using LocAccT1 = sycl::local_accessor; + LocAccT1 local_A_block(wg_delta_n * wi_delta_n * wi_delta_k, cgh); + + using LocAccT2 = sycl::local_accessor; + LocAccT2 local_B_block(wg_delta_m * wi_delta_m_vecs * wi_delta_k, cgh); + + using Impl_FunctorT = GemmBatchFunctorThreadNM_vecm< + lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, BatchIndexerT, LhsIndexerT, + RhsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m_vecs, m_vec_size>; + + cgh.parallel_for( + ndRange, Impl_FunctorT( + lhs_tp, rhs_tp, res_tp, std::move(local_A_block), + std::move(local_B_block), batch_nelems, n, k, m, + n_groups, wg_delta_n, wg_delta_m, wi_delta_k, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +} // 
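+
+// Dispatch sketch for the entry points below: the tiled NM kernel is chosen
+// when min(n, m) > 0 and max(n, m) >= (64 * 1024) / min(n, m), i.e. roughly
+// when n * m reaches 64 * 1024. For example, n = m = 256 gives
+// n * m == 65536 and takes the NM path; smaller problems fall through to the
+// small-m and thread-per-k kernels after zero-initializing the result.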
+
+typedef sycl::event (*gemm_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *,    // lhs
+    const char *,    // rhs
+    char *,          // res
+    std::size_t,     // lhs_outer_nelems (n)
+    std::size_t,     // inner_nelems (k)
+    std::size_t,     // rhs_outer_nelems (m)
+    int,             // inner nd
+    int,             // lhs outer nd
+    const ssize_t *, // lhs shape and strides
+    int,             // rhs outer nd
+    const ssize_t *, // rhs shape and strides
+    int,             // res outer nd
+    const ssize_t *, // res shape and strides
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_impl(sycl::queue &exec_q,
+                      const char *lhs_cp,
+                      const char *rhs_cp,
+                      char *res_cp,
+                      std::size_t n,
+                      std::size_t k,
+                      std::size_t m,
+                      int inner_nd,
+                      int lhs_outer_nd,
+                      const ssize_t *lhs_shape_strides,
+                      int rhs_outer_nd,
+                      const ssize_t *rhs_shape_strides,
+                      int res_outer_nd,
+                      const ssize_t *res_shape_strides,
+                      std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    using OuterInnerIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const OuterInnerIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
+                                         lhs_shape_strides);
+    const OuterInnerIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
+                                         rhs_shape_strides);
+    const OuterInnerIndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+        const IndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
+        using InitKernelName = class gemm_init_krn<lhsTy, rhsTy, resTy>;
+        cgh.parallel_for<InitKernelName>(
+            sycl::range<1>(n * m), [=](sycl::id<1> id) {
+                auto res_offset = res_indexer(id[0]);
+                res_tp[res_offset] = resTy(0);
+            });
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if (max_nm < 64) {
+        if (m < 4) {
+            return gemm_detail::_gemm_small_m_impl<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+                OuterInnerIndexerT, OuterInnerIndexerT>(
+                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+                {res_init_ev});
+        }
+        return gemm_detail::_gemm_k_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+
+    return gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+        OuterInnerIndexerT, OuterInnerIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
+}
+
+typedef sycl::event (*gemm_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *, // lhs
+    const char *, // rhs
+    char *,       // res
+    std::size_t,  // n
+    std::size_t,  // k
+    std::size_t,  // m
+    std::vector<sycl::event> const &);
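+
+// Contiguous variant: lhs, rhs and res are assumed to be C-contiguous
+// (n, k), (k, m) and (n, m) blocks, so NoOpIndexer stands in for the strided
+// indexers (the flat id is the memory offset) and cgh.fill replaces the
+// strided zero-initialization kernel used in gemm_impl above.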
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_contig_impl(sycl::queue &exec_q,
+                             const char *lhs_cp,
+                             const char *rhs_cp,
+                             char *res_cp,
+                             std::size_t n,
+                             std::size_t k,
+                             std::size_t m,
+                             std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
+    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    using OuterInnerIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+    static constexpr OuterInnerIndexerT lhs_indexer{};
+    static constexpr OuterInnerIndexerT rhs_indexer{};
+    static constexpr OuterInnerIndexerT res_indexer{};
+
+    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
+    static constexpr BatchIndexerT batch_indexer{};
+
+    static constexpr std::size_t single_batch_nelems = 1;
+
+    const std::size_t min_nm = std::min(n, m);
+    const std::size_t max_nm = std::max(n, m);
+    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
+        return gemm_detail::_gemm_batch_nm_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
+    }
+
+    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.fill(res_tp, resTy(0), n * m);
+    });
+
+    if (k == 0) {
+        return res_init_ev;
+    }
+
+    if (max_nm < 64) {
+        if (m < 4) {
+            return gemm_detail::_gemm_small_m_impl<
+                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+                OuterInnerIndexerT, OuterInnerIndexerT>(
+                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+                {res_init_ev});
+        }
+        return gemm_detail::_gemm_k_impl<
+            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+            OuterInnerIndexerT, OuterInnerIndexerT>(
+            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
+            {res_init_ev});
+    }
+
+    return gemm_detail::_gemm_batch_nm_impl<
+        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
+        OuterInnerIndexerT, OuterInnerIndexerT>(
+        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
+        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
+}
+
+template <typename T1, typename T2, typename T3>
+class gemm_batch_init_krn;
+
+typedef sycl::event (*gemm_batch_impl_fn_ptr_t)(
+    sycl::queue &,
+    const char *,    // lhs
+    const char *,    // rhs
+    char *,          // res
+    std::size_t,     // batch nelems
+    std::size_t,     // lhs outer nelems (n)
+    std::size_t,     // inner nelems (k)
+    std::size_t,     // rhs outer nelems (m)
+    int,             // batching nd
+    const ssize_t *, // batch shape strides
+    ssize_t,         // lhs batch offset
+    ssize_t,         // rhs batch offset
+    ssize_t,         // res batch offset
+    int,             // inner dims
+    int,             // lhs outer dims
+    const ssize_t *, // lhs outer and inner shape and strides
+    int,             // rhs outer dims
+    const ssize_t *, // rhs outer and inner shape and strides
+    int,             // res outer dims
+    const ssize_t *, // res outer and inner shape and strides
+    const ssize_t *, // res full shape and strides
+    std::vector<sycl::event> const &);
+
+template <typename lhsTy, typename rhsTy, typename resTy>
+sycl::event gemm_batch_impl(sycl::queue &exec_q,
+                            const char *lhs_cp,
+                            const char *rhs_cp,
+                            char *res_cp,
+                            std::size_t batch_nelems,
+                            std::size_t n,
+                            std::size_t k,
+                            std::size_t m,
+                            int batch_nd,
+                            const ssize_t *batch_shape_strides,
+                            ssize_t lhs_batch_offset,
+                            ssize_t rhs_batch_offset,
+                            ssize_t res_batch_offset,
+                            int inner_nd,
+                            int lhs_outer_nd,
+                            const ssize_t *lhs_outer_inner_shapes_strides,
+                            int rhs_outer_nd,
+                            const ssize_t *rhs_outer_inner_shapes_strides,
+                            int res_outer_nd,
+                            const ssize_t *res_outer_shapes_strides,
+                            const ssize_t *res_shape_strides,
+                            std::vector<sycl::event> const &depends = {})
+{
+    const lhsTy
*lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + } + + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT res_indexer(batch_nd + res_outer_nd, res_batch_offset, + res_shape_strides); + using InitKernelName = class gemm_batch_init_krn; + cgh.parallel_for( + sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = resTy(0); + }); + }); + + if (k == 0) { + return res_init_ev; + } + + if (m < 4) { + return gemm_detail::_gemm_small_m_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + else if (k > n && k > m) { + return gemm_detail::_gemm_k_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + else { + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } +} + +typedef sycl::event (*gemm_batch_contig_impl_fn_ptr_t)( + sycl::queue &, + const char *, // lhs + const char *, // rhs + char *, // res + std::size_t, // batch nelems + std::size_t, // n + std::size_t, // k + std::size_t, // m + ssize_t, // lhs batch offset + ssize_t, // rhs batch offset + ssize_t, // res batch offset + std::vector const &); + +template +sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = + reinterpret_cast(lhs_cp) + lhs_batch_offset; + const rhsTy *rhs_tp = + reinterpret_cast(rhs_cp) + rhs_batch_offset; + resTy *res_tp = reinterpret_cast(res_cp) + res_batch_offset; + + using OuterInnerDimsIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + } + + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(0), n * m * batch_nelems); + }); + + if (k == 0) { + return res_init_ev; + } + + if (max_nm < 64) { + if (m < 4) { + return gemm_detail::_gemm_small_m_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + return gemm_detail::_gemm_k_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + {res_init_ev}); + } + + return gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, {res_init_ev}); +} + +// ========== Gemm Tree + +template +class GemmBatchNoAtomicFunctorThreadNM +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT1 local_A_block; + LocAccT2 local_B_block; + std::size_t n = 0; + std::size_t wg_delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t wi_delta_k = 0; + std::size_t m = 0; + std::size_t m_blocks = 0; + std::size_t wg_delta_m = 0; + std::size_t batch_nelems; + BatchDimsIndexerT batch_indexer; + OuterInnerDimsIndexerT lhs_indexer; + OuterInnerDimsIndexerT rhs_indexer; + ResIndexerT res_indexer; + +public: + GemmBatchNoAtomicFunctorThreadNM(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT1 local_A_block_, + LocAccT2 local_B_block_, + std::size_t n_, + std::size_t wg_delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t wi_delta_k_, + std::size_t m_, + std::size_t m_blocks_, + std::size_t wg_delta_m_, + std::size_t batch_nelems_, + const BatchDimsIndexerT batch_indexer_, + const OuterInnerDimsIndexerT lhs_indexer_, + const OuterInnerDimsIndexerT rhs_indexer_, + const ResIndexerT res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), local_A_block(local_A_block_), + local_B_block(local_B_block_), n(n_), wg_delta_n(wg_delta_n_), k(k_), + k_blocks(k_blocks_), wi_delta_k(wi_delta_k_), m(m_), + m_blocks(m_blocks_), 
wg_delta_m(wg_delta_m_), + batch_nelems(batch_nelems_), batch_indexer(batch_indexer_), + lhs_indexer(lhs_indexer_), rhs_indexer(rhs_indexer_), + res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - m_id * n_groups_per_batch; + + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); + + // lift group_id to (block_i, block_j, block_s), + // 0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s + // < k_blocks + + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + std::size_t block_i = gr_id / (m_blocks * k_blocks); + std::size_t block_r = gr_id - block_i * (m_blocks * k_blocks); + std::size_t block_j = block_r / k_blocks; + std::size_t block_s = block_r - block_j * k_blocks; + + std::size_t lid = it.get_local_linear_id(); + std::size_t local_i = lid / wg_delta_m; // 0<= local_i < wg_delta_n + std::size_t local_j = + lid - local_i * wg_delta_m; // 0<= local_j < wg_delta_m + + // load A block and B blocks into SLM + + std::size_t i = block_i * wi_delta_n * wg_delta_n; + std::size_t j = block_j * wi_delta_m * wg_delta_m; + std::size_t s = block_s * wi_delta_k; + + const std::int64_t a_st0 = k; + const std::int64_t a_st1 = 1; + + const std::int64_t b_st0 = m; + const std::int64_t b_st1 = 1; + + const std::int64_t c_st0 = m; + const std::int64_t c_st1 = 1; + + std::size_t lws = it.get_local_range(0); + + for (std::size_t vid = lid; vid < local_A_block.size(); vid += lws) { + std::size_t v_i = + vid / wi_delta_k; // 0<= v_i < wg_delta_n * wi_delta_n + std::size_t v_s = vid - v_i * wi_delta_k; // 0<= v_s < wi_delta_k + + std::size_t g_i = i + v_i; + std::size_t g_s = s + v_s; + + local_A_block[vid] = + (g_i < n && g_s < k) + ? static_cast( + lhs[lhs_offset + + lhs_indexer(g_i * a_st0 + g_s * a_st1)]) + : resT(0); + } + + using slmB_t = typename LocAccT2::value_type; + + for (std::size_t vid = lid; vid < local_B_block.size(); vid += lws) { + std::size_t v_j = vid / wi_delta_k; // 0<= v_i < wg_delta_m + std::size_t v_s = vid - v_j * wi_delta_k; // 0<= v_s < wi_delta_k + + std::size_t g_j = j + v_j * wi_delta_m; + std::size_t g_s = s + v_s; + + if constexpr (wi_delta_m == 1 && std::is_same_v) { + local_B_block[vid] = + (g_j < m && g_s < k) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + g_j * b_st1)]) + : resT(0); + } + else { + slmB_t vec{}; +#pragma unroll + for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; + ++lane_id) { + std::size_t g_j1 = g_j + lane_id; + vec[lane_id] = + (g_j1 < m && g_s < k) + ? 
static_cast( + rhs[rhs_offset + + rhs_indexer(g_s * b_st0 + g_j1 * b_st1)]) + : resT(0); + } + + local_B_block[vid] = vec; + } + } + + it.barrier(sycl::access::fence_space::local_space); + + i += local_i * wi_delta_n; + j += local_j * wi_delta_m; + + const std::size_t a_offset = local_i * wi_delta_k * wi_delta_n; + const std::size_t b_offset = local_j * wi_delta_k; + + static constexpr resT identity_(0); + + for (std::uint8_t private_i = 0; private_i < wi_delta_n; ++private_i) { + const std::size_t a_pr_offset = private_i * wi_delta_k; + + slmB_t local_sum(identity_); + for (std::size_t private_s = 0; private_s < wi_delta_k; + ++private_s) { + local_sum = local_sum + + (local_A_block[a_offset + a_pr_offset + private_s] * + local_B_block[b_offset + private_s]); + } + + const std::size_t gl_i = i + private_i; + + if constexpr (wi_delta_m == 1 && std::is_same_v) { + const std::size_t gl_j = j; + if (gl_i < n && gl_j < m) { + res[res_offset + res_indexer(gl_i * c_st0 + gl_j * c_st1) + + (block_s * n * m * batch_nelems)] = local_sum; + } + } + else { +#pragma unroll + for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; + ++lane_id) { + const std::size_t gl_j = j + lane_id; + + if (gl_i < n && gl_j < m) { + res[res_offset + + res_indexer(gl_i * c_st0 + gl_j * c_st1) + + (block_s * n * m * batch_nelems)] = + local_sum[lane_id]; + } + } + } + } + } +}; + +template +class GemmBatchNoAtomicFunctorThreadK +{ +private: + const lhsT *lhs = nullptr; + const rhsT *rhs = nullptr; + resT *res = nullptr; + LocAccT workspace; + LocAccT local_B_block; + std::size_t n = 0; + std::size_t n_blocks = 0; + std::size_t delta_n = 0; + std::size_t k = 0; + std::size_t k_blocks = 0; + std::size_t delta_k = 0; + std::size_t n_wi = 0; + std::size_t m = 0; + std::size_t batch_nelems = 0; + BatchDimsIndexerT batch_indexer; + OuterInnerDimsIndexerT lhs_indexer; + OuterInnerDimsIndexerT rhs_indexer; + ResIndexerT res_indexer; + +public: + GemmBatchNoAtomicFunctorThreadK(const lhsT *lhs_, + const rhsT *rhs_, + resT *res_, + LocAccT workspace_, + LocAccT local_B_block_, + std::size_t n_, + std::size_t n_blocks_, + std::size_t delta_n_, + std::size_t k_, + std::size_t k_blocks_, + std::size_t delta_k_, + std::size_t n_wi_, + std::size_t m_, + std::size_t batch_nelems_, + const BatchDimsIndexerT &batch_indexer_, + const OuterInnerDimsIndexerT &lhs_indexer_, + const OuterInnerDimsIndexerT &rhs_indexer_, + const ResIndexerT &res_indexer_) + : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_), + local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_), + delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_), + n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_), + batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_), + rhs_indexer(rhs_indexer_), res_indexer(res_indexer_) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t n_groups_per_batch = + it.get_group_range(0) / batch_nelems; + const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch; + const std::size_t gr_id = + it.get_group_linear_id() - m_id * n_groups_per_batch; + std::size_t lid = it.get_local_linear_id(); + + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); + const auto &lhs_offset = three_offsets_.get_first_offset(); + const auto &rhs_offset = three_offsets_.get_second_offset(); + const auto &res_offset = three_offsets_.get_third_offset(); + + // lift gr_id -> (block_i, block_j, block_s) + // block_i moves fastest, then block_s, then block_j + + const std::size_t r_size = (n_blocks * k_blocks); 
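+        // Example of the lifting below: with n_blocks = 3 and k_blocks = 2,
+        // r_size = 6; gr_id = 7 decomposes to block_j = 1, block_s = 0,
+        // block_i = 1, consistent with
+        // gr_id = block_j * r_size + block_s * n_blocks + block_i.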
+ // 0 <= block_j < m_blocks + std::size_t block_j = gr_id / r_size; + // 0 <= block_r < n_blocks * k_blocks + std::size_t block_r = gr_id - block_j * r_size; + // 0 <= block_s < k_blocks + std::size_t block_s = block_r / n_blocks; + // 0 <= block_i < n_blocks + std::size_t block_i = block_r - block_s * n_blocks; + + std::size_t local_i = lid / (delta_k); // 0 <= local_i < delta_n + std::size_t local_s = + lid - local_i * (delta_k); // 0 <= local_s < delta_k + + std::size_t i = block_i * delta_n + local_i; + std::size_t j = m_groups * block_j; + std::size_t s = block_s * delta_k * n_wi + local_s; + + using accV_t = typename LocAccT::value_type; + + static constexpr resT identity_ = resT(0); + if (local_i == 0) { + for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) { + std::size_t sq = s + q; + std::size_t sqmj = sq * m + j; + + if constexpr (m_groups == 1 && std::is_same_v) { + local_B_block[local_s + q] = + (sq < k && j < m) + ? static_cast( + rhs[rhs_offset + rhs_indexer(sqmj)]) + : identity_; + } + else { + accV_t local_B_vec; +#pragma unroll + for (std::size_t vec_idx = 0; vec_idx < m_groups; + ++vec_idx) { + local_B_vec[vec_idx] = + (sq < k && j + vec_idx < m) + ? static_cast( + rhs[rhs_offset + + rhs_indexer(sqmj + vec_idx)]) + : identity_; + } + local_B_block[local_s + q] = local_B_vec; + } + } + } + + it.barrier(sycl::access::fence_space::local_space); + + std::size_t t_shift = block_s * delta_k * n_wi; + std::size_t global_s_offset = i * k + t_shift; + + accV_t private_sum(identity_); + static constexpr accV_t vec_identity_(identity_); + for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) { + private_sum += + ((i < n) && (t + t_shift < k)) + ? (static_cast( + lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) * + local_B_block[t]) + : vec_identity_; + } + + std::size_t workspace_i_shift = local_i * delta_k; + workspace[workspace_i_shift + local_s] = private_sum; + + it.barrier(sycl::access::fence_space::local_space); + + if (local_s == 0 && i < n) { + accV_t local_sum(workspace[workspace_i_shift]); + for (std::size_t t = 1; t < delta_k; ++t) { + local_sum += workspace[workspace_i_shift + t]; + } + + const std::size_t total_offset = + res_offset + (block_s * n * m * batch_nelems); + + if constexpr (m_groups == 1 && std::is_same_v) { + res[total_offset + res_indexer(i * m + j)] = local_sum; + } + else { + res[total_offset + res_indexer(i * m + j)] = local_sum[0]; + +#pragma unroll + for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) { + if (j + vec_id < m) { + res[total_offset + res_indexer(i * m + j + vec_id)] = + local_sum[vec_id]; + } + } + } + } + } +}; + +template +class gemm_batch_tree_k_krn; + +template +class gemm_batch_tree_nm_krn; + +namespace gemm_detail +{ + +template +sycl::event _gemm_tree_k_step(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const std::size_t delta_n, + const std::size_t n_wi, + const std::size_t delta_k, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static_assert(std::is_same_v); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t n_blocks = (n + delta_n - 1) / delta_n; + const std::size_t k_blocks = + (k + n_wi * delta_k - 1) / (n_wi * delta_k); + const std::size_t m_blocks = (m + m_groups 
- 1) / m_groups; + + const std::size_t lws = delta_n * delta_k; + const std::size_t gws = + batch_nelems * n_blocks * m_blocks * k_blocks * lws; + + auto gRange = sycl::range<1>(gws); + auto lRange = sycl::range<1>(lws); + auto ndRange = sycl::nd_range<1>(gRange, lRange); + + using slmB_t = + typename std::conditional>::type; + + using LocAccT = sycl::local_accessor; + LocAccT local_B_block(n_wi * delta_k, cgh); + LocAccT workspace(delta_n * delta_k, cgh); + + using KernelName = + class gemm_batch_tree_k_krn; + + cgh.parallel_for( + ndRange, + GemmBatchNoAtomicFunctorThreadK( + lhs_tp, rhs_tp, res_tp, std::move(workspace), + std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks, + delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +} // end of namespace gemm_detail + +template +sycl::event + gemm_batch_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + if (k <= (delta_k * n_wi)) { + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer( + batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-group is needed, requires a + // temporary delta_k * n_wi elements processed along k, + // so if more to process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t 
reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + // max_max_wg prevents running out of resources on CPU + static constexpr std::size_t max_max_wg = 2048; + std::size_t max_wg = std::min( + max_max_wg, + dev.get_info() / 2); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer< + StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>; + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const UnpackedStridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides, + batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * ( + /* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + // get unique_ptr owning the temporary allocation + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + // get raw USM pointer + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + ; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using 
BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const StridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n, + k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer, depends); + + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +namespace gemm_detail +{ + +template +sycl::event _gemm_tree_nm_step(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + const std::size_t batch_nelems, + const std::size_t n, + const std::size_t k, + const std::size_t m, + const std::uint32_t wg_delta_n, + const std::uint32_t wg_delta_m, + const std::uint32_t wi_delta_k, + const BatchIndexerT &batch_indexer, + const LhsIndexerT &lhs_indexer, + const RhsIndexerT &rhs_indexer, + const ResIndexerT &res_indexer, + const std::vector &depends) +{ + static_assert(std::is_same_v); + + sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lws = wg_delta_n * wg_delta_m; + + const std::size_t n_blocks = + ((n + wi_delta_n * wg_delta_n - 1) / (wi_delta_n * wg_delta_n)); + const std::size_t k_blocks = ((k + wi_delta_k - 1) / wi_delta_k); + const std::size_t m_blocks = + ((m + wi_delta_m * wg_delta_m - 1) / (wi_delta_m * wg_delta_m)); + + const std::size_t gws = + batch_nelems * n_blocks * m_blocks * k_blocks * lws; + + auto gwsRange = sycl::range<1>(gws); + auto lwsRange = sycl::range<1>(lws); + auto ndRange = sycl::nd_range<1>(gwsRange, lwsRange); + + using slmB_t = + typename std::conditional>::type; + using LocAccT1 = sycl::local_accessor; + using LocAccT2 = sycl::local_accessor; + + const sycl::range<1> local_A_size((wi_delta_n * wg_delta_n) * + wi_delta_k); + const sycl::range<1> local_B_size(wi_delta_k * wg_delta_m); + + LocAccT1 local_A_block(local_A_size, cgh); + LocAccT2 local_B_block(local_B_size, cgh); + + using KernelName = + class gemm_batch_tree_nm_krn; + cgh.parallel_for( + ndRange, GemmBatchNoAtomicFunctorThreadNM< + lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, LhsIndexerT, + ResIndexerT, BatchIndexerT, wi_delta_n, wi_delta_m>( + lhs_tp, rhs_tp, res_tp, std::move(local_A_block), + std::move(local_B_block), n, wg_delta_n, k, k_blocks, + wi_delta_k, m, m_blocks, wg_delta_m, batch_nelems, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer)); + }); + return gemm_ev; +} + +} // end namespace gemm_detail + +template +sycl::event + gemm_batch_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + 
std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + // each group processes delta_k * n_wi + // items in a column, so no need for allocating + // temp memory if only one group is needed + if (k <= wi_delta_k) { + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer( + batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-group is needed, requires a temporary + // delta_k * n_wi elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + 
inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer< + StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>; + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const UnpackedStridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides, + batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, wg_delta_n, + wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + ; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const OuterInnerDimsIndexerT lhs_indexer( + inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer( + inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides); + static constexpr TmpIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::StridedIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer< + StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>; + + const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset, + batch_shape_strides); + const UnpackedStridedIndexer rhs_batch_indexer( + batch_nd, rhs_batch_offset, batch_shape_strides, + batch_shape_strides + 2 * batch_nd); + const Strided1DIndexer tmp_batch_indexer( + /* size */ batch_nelems, + /* step */ n * m); + const BatchDimsIndexerT batch_indexer( + lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n, + k, 
m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + batch_nd + res_outer_nd, res_batch_offset, res_shape_strides, + {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_batch_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends = {}) +{ + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_outer_shapes_strides); + + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer; + const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, + batch_shape_strides); + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + + return gemm_ev; +} + +template +class gemm_batch_tree_empty_krn; + +template +sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_batch_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + + if (k == 0) { + sycl::event gemm_batch_no_reduction_ev = + 
exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT res_indexer(batch_nd + res_outer_nd, + res_batch_offset, res_shape_strides); + using InitKernelName = + class gemm_batch_tree_empty_krn; + cgh.parallel_for( + sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = resTy(0); + }); + }); + return gemm_batch_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + static constexpr std::uint32_t m_groups_one = 1; + return gemm_batch_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_nd, batch_shape_strides, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + else { + static constexpr std::uint32_t m_groups_four = 4; + return gemm_batch_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_nd, batch_shape_strides, lhs_batch_offset, + rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + } + else { + static constexpr std::uint32_t m_groups_one = 1; + return gemm_batch_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + static constexpr std::uint32_t m_groups_four = 4; + return gemm_batch_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + else { // m > 1, n > k or m > k, resTy complex + static constexpr std::uint32_t m_groups_one = 1; + return gemm_batch_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd, + batch_shape_strides, lhs_batch_offset, rhs_batch_offset, + res_batch_offset, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_outer_nd, + res_outer_shapes_strides, res_shape_strides, depends); + } + } +} + +template +sycl::event + gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + if (k <= (delta_k * n_wi)) { + 
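+        // Single-block case: k fits in one reduction block of delta_k * n_wi
+        // elements (e.g. 4 * 64 = 256 with the unscaled defaults above), so
+        // one kernel launch writes res directly and no temporary allocation
+        // or follow-up reduction pass is needed.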
using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer, + depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-group is needed, requires a + // temporary delta_k * n_wi elements processed along k, + // so if more to process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n, + n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + tmp_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + 
else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n, + k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, + rhs_indexer, tmp_indexer, depends); + + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event + gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + // each group processes delta_k * n_wi + // items in a column, so no need for allocating + // temp memory if only one group is needed + if (k <= wi_delta_k) { + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + 
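+        // Contiguous batching: batch b starts at offset b * n * k in lhs,
+        // b * k * m in rhs and b * n * m in res, which is exactly what the
+        // three Strided1DIndexer steps above encode; e.g. n = 4, k = 5,
+        // m = 3 gives steps 20, 15 and 12.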
return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + std::size_t iter_nelems = batch_nelems * n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-group is needed, requires a temporary + // delta_k * n_wi elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 4; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + + resTy *tmp = tmp_owner.get(); + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, tmp_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using OuterInnerDimsIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT tmp_indexer{}; + + using 
dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + batch_nelems, n, k, m, wg_delta_n, wg_delta_m, + wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer, + tmp_indexer, depends); + + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_shape_strides, + int rhs_outer_nd, + const ssize_t *rhs_shape_strides, + int res_outer_nd, + const ssize_t *res_shape_strides, + std::vector const &depends = {}) +{ + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_shape_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_shape_strides); + const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0, + res_shape_strides); + + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchDimsIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + return gemm_ev; +} + +template +sycl::event + gemm_batch_nm_contig_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends = {}) +{ + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + if (batch_nelems == single_batch_nelems) { + using BatchDimsIndexerT = + dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchDimsIndexerT batch_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + return 
gemm_ev; + } + else { + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer; + using BatchDimsIndexerT = + ThreeOffsets_CombinedIndexer; + + using dpctl::tensor::offset_utils::Strided1DIndexer; + + const BatchDimsIndexerT batch_indexer( + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * k}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ k * m}, + Strided1DIndexer{/* size */ batch_nelems, + /* step */ n * m}); + + sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl< + lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + return gemm_ev; + } +} + +template +sycl::event + gemm_batch_contig_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t batch_nelems, + std::size_t n, + std::size_t k, + std::size_t m, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = + reinterpret_cast(lhs_cp) + lhs_batch_offset; + const rhsTy *rhs_tp = + reinterpret_cast(rhs_cp) + rhs_batch_offset; + resTy *res_tp = reinterpret_cast(res_cp) + res_batch_offset; + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_batch_nm_contig_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + + if (k == 0) { + sycl::event gemm_batch_no_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(0), n * m * batch_nelems); + }); + return gemm_batch_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + return gemm_batch_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + depends); + } + else { + return gemm_batch_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, + depends); + } + } + else { + return gemm_batch_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + return gemm_batch_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + else { // m > 1, n > k or m > k, resTy complex + return gemm_batch_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends); + } + } +} + +// Gemm tree non-batched + +template +class gemm_tree_nm_krn; + +template +class gemm_tree_k_krn; + +template +sycl::event gemm_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_nd, + const ssize_t *res_shapes_strides, + const std::vector &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, 
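+        // scale_gemm_k_parameters shrinks delta_k and n_wi until one
+        // work-group's SLM scratch fits into local_mem_size minus the
+        // reserved bytes; a simplified sketch of the budget it enforces
+        // (assumed here, not the verbatim implementation):
+        //   while (sizeof(resTy) * m_groups * (delta_n + n_wi) * delta_k
+        //          + reserved_slm_size >= local_mem_size)
+        //       halve delta_k (and, once delta_k == 1, halve n_wi);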
reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + + sycl::event gemm_ev; + if (k <= (delta_k * n_wi)) { + const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides); + + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-groups is needed, requires a temporary + // delta_k * n_wi elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, res_nd, 0, + res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = 
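+            // The GEMM step below writes its partial products into the
+            // contiguous scratch (hence NoOpIndexer as the result indexer);
+            // the strided layout of the real output is applied only by the
+            // reduction kernel, which receives res_nd and res_shapes_strides.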
gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, delta_n, n_wi, delta_k, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + // tree_reduction_for_gemm returns sycl::event for reduction + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + res_nd, 0, res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_nd, + const ssize_t *res_shapes_strides, + const std::vector &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0, + lhs_outer_inner_shapes_strides); + const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0, + rhs_outer_inner_shapes_strides); + + // each group processes delta_k items in a column, + // so no need to allocate temp memory if one group needed + if (k <= wi_delta_k) { + const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides); + + return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, + k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-groups is needed, requires a temporary + // wi_delta_k elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t 
reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m, + wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer, depends); + + sycl::event red_ev = single_reduction_for_gemm( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, res_nd, 0, + res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr ResIndexerT res_indexer{}; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, wg_delta_n, wg_delta_m, + wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = tree_reduction_for_gemm( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp, + identity_val, iter_nelems, reduction_nelems, reduction_groups, + wg, max_wg, preferred_reductions_per_wi, reductions_per_wi, + res_nd, 0, res_shapes_strides, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +class gemm_tree_empty_krn; + +template +sycl::event gemm_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t n, + std::size_t k, + std::size_t m, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_nd, + const ssize_t *res_shapes_strides, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + return gemm_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + 
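+            // Dispatch note: the guard above, max_nm >= (64 * 1024) / min_nm,
+            // is an integer-arithmetic test for n * m >= 64K, i.e. outputs of
+            // at least 65536 elements go to the dedicated nm kernel without a
+            // k-tree reduction. Illustrative check (values not from this
+            // file): n = 512, m = 256 gives 64K / 256 = 256 <= 512.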
lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + + if (k == 0) { + sycl::event gemm_no_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT res_indexer(res_nd, 0, res_shapes_strides); + using InitKernelName = + class gemm_tree_empty_krn; + cgh.parallel_for( + sycl::range<1>(n * m), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = resTy(0); + }); + }); + return gemm_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + return gemm_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, + lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + else { + return gemm_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, + lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + } + else { + return gemm_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + return gemm_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + else { + return gemm_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd, + lhs_outer_inner_shapes_strides, rhs_outer_nd, + rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides, + depends); + } + } +} + +template +sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + std::size_t delta_k(4); + std::size_t n_wi(64); + std::size_t delta_n(32); + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t reserved_slm_size = 512; + + gemm_detail::scale_gemm_k_parameters( + local_mem_size, reserved_slm_size, delta_k, + n_wi, // modified by reference + delta_n // modified by reference + ); + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + sycl::event gemm_ev; + if (k <= (delta_k * n_wi)) { + return gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy 
identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = + (k + delta_k * n_wi - 1) / (delta_k * n_wi); + + // more than one work-groups is needed, requires a + // temporary delta_k * n_wi elements processed along k, + // so if more to process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m, + delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>( + exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, delta_n, n_wi, delta_k, + batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends); + + // tree_reduction_for_gemm_contig returns sycl::event + // for reduction + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q, + const lhsTy *lhs_tp, + const rhsTy *rhs_tp, + resTy *res_tp, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends) +{ + static constexpr int wi_delta_n = 2; + std::size_t wg_delta_n(16); // rows of A processed in WG + std::size_t wg_delta_m(16); // rows of B processed in WG + std::size_t wi_delta_k(64); // Elements in K dimension processed by WI + + const sycl::device &dev = exec_q.get_device(); + const std::size_t local_mem_size = + dev.get_info(); + const std::size_t 
reserved_slm_size = 512; + + gemm_detail::scale_gemm_nm_parameters( + local_mem_size, reserved_slm_size, wi_delta_n, + wi_delta_k, // modified by reference + wg_delta_n, // modified by reference + wg_delta_m // modified by reference + ); + + using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr OuterInnerDimsIndexerT lhs_indexer{}; + static constexpr OuterInnerDimsIndexerT rhs_indexer{}; + static constexpr OuterInnerDimsIndexerT res_indexer{}; + + using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer; + static constexpr BatchIndexerT batch_indexer{}; + + static constexpr std::size_t single_batch_nelems = 1; + + // each group processes delta_k items in a column, + // so no need to allocate temp memory if one group needed + if (k <= wi_delta_k) { + + return gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, + k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, + lhs_indexer, rhs_indexer, res_indexer, depends); + } + else { + using ReductionOpT = + typename std::conditional, + sycl::logical_or, + sycl::plus>::type; + static constexpr resTy identity_val = + sycl::known_identity::value; + + std::size_t iter_nelems = n * m; + std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k; + + // more than one work-groups is needed, requires a temporary + // wi_delta_k elements processed along k, so if more to + // process use multiple + const auto &sg_sizes = + dev.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi(preferred_reductions_per_wi); + + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + std::size_t max_wg = reduction_detail::get_work_group_size(dev); + + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * reduction_nelems, exec_q); + resTy *tmp = tmp_owner.get(); + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, + k, m, wg_delta_n, wg_delta_m, wi_delta_k, + batch_indexer, lhs_indexer, rhs_indexer, + res_indexer, depends); + + sycl::event red_ev = + single_reduction_for_gemm_contig( + exec_q, tmp, res_tp, identity_val, iter_nelems, + reduction_nelems, reduction_groups, wg, max_wg, + preferred_reductions_per_wi, reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + return cleanup_host_task_event; + } + else { + assert(reduction_groups > 1); + + const std::size_t tmp_alloc_size = + iter_nelems * (/* temp */ reduction_nelems + + /* first reduction temp */ reduction_groups); + + auto tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_nelems * iter_nelems; + + sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step< + lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT, + 
OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n, + wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, + single_batch_nelems, n, k, m, wg_delta_n, + wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, + rhs_indexer, res_indexer, depends); + + sycl::event red_ev = + tree_reduction_for_gemm_contig( + exec_q, partially_reduced_tmp, partially_reduced_tmp2, + res_tp, identity_val, iter_nelems, reduction_nelems, + reduction_groups, wg, max_wg, preferred_reductions_per_wi, + reductions_per_wi, {gemm_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev}, + tmp_owner); + + return cleanup_host_task_event; + } + } +} + +template +sycl::event gemm_contig_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + std::size_t n, + std::size_t k, + std::size_t m, + std::vector const &depends = {}) +{ + const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); + const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + const std::size_t min_nm = std::min(n, m); + const std::size_t max_nm = std::max(n, m); + + if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) { + static constexpr std::size_t single_batch_nelems = 1; + return gemm_batch_nm_contig_impl( + exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m, + depends); + } + + if (k == 0) { + sycl::event gemm_no_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.fill(res_tp, resTy(0), n * m); + }); + return gemm_no_reduction_ev; + } + + if (max_nm < 64) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + if (m < 4) { + return gemm_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + else { + return gemm_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + } + else { + return gemm_contig_tree_k_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + } + else { // m > 1, n > k or m > k + using dpctl::tensor::type_utils::is_complex; + if constexpr (!is_complex::value) { + return gemm_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + else { + return gemm_contig_tree_nm_impl( + exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends); + } + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/reductions.hpp b/dpnp/tensor/libtensor/include/kernels/reductions.hpp new file mode 100644 index 000000000000..75df2c201968 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/reductions.hpp @@ -0,0 +1,3313 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor reduction along axis. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace reduction_detail +{ + +inline std::size_t get_work_group_size(const sycl::device &d) +{ + // prevents running out of resources on CPU + return std::min( + 2048, d.get_info() / 2); +} + +} // namespace reduction_detail + +template +struct needs_workaround +{ + static constexpr bool value = + (std::is_same_v> && + (std::is_same_v || + std::is_same_v)) || + (__LIBSYCL_MAJOR_VERSION < 7 && std::is_same_v && + std::is_same_v>); +}; + +template +struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !needs_workaround::value; +}; + +template +struct SequentialReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialReduction(const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + outT red_val(identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr 
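+            // assumed rationale: for logical reductions the input is first
+            // converted through bool, so that NaN (which compares unequal
+            // to zero) maps to true, matching any/all semantics; all other
+            // operators convert the input value directly to outT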
(su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + red_val = reduction_op_(red_val, val); + } + + out_[out_iter_offset] = red_val; + } +}; + +/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ + +/* + This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 + if the device has aspect atomic64 and only with those supported by + sycl::atomic_ref +*/ +template +struct ReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = static_cast( + sycl::all_of_group(work_group, local_red_val)); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = static_cast( + sycl::any_of_group(work_group, local_red_val)); + } + else { + red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, 
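+            // sycl::reduce_over_group folds each work-item's local_red_val
+            // across the whole work-group (conceptually in log2(wg) steps);
+            // e.g. for sycl::plus over per-item values {1, 2, 3, 4} every
+            // work-item in the group receives 10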
reduction_op_); + } + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (su_ns::IsPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalOr::value) { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, 
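+            // custom_reduce_over_group is the SLM-based fallback used when
+            // can_use_reduce_over_group is false (no sycl::known_identity,
+            // or a known work-around case): partial values are staged in the
+            // local_mem_ accessor and folded by a barrier-synchronized
+            // work-group tree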
local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + // retain these checks in case a reduce_over_group work-around is + // needed + if constexpr (su_ns::IsSyclPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsSyclMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalOr::value) { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +template +struct ReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + 
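+        // logical AND / OR are lowered onto the group vote functions,
+        // presumably because they fall outside the operator set that
+        // sycl::reduce_over_group handles here: all_of_group(wg, v)
+        // evaluates v_0 && v_1 && ... && v_{wg-1}, and any_of_group the
+        // corresponding || chain, which is exactly the desired reduction
+        // for bool-like values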
if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = sycl::all_of_group(work_group, local_red_val); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = sycl::any_of_group(work_group, local_red_val); + } + else { + red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, reduction_op_); + } + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ + +template +struct CustomReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (std::is_same_v> || + std::is_same_v>) { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * 
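+            // scratch layout of the no-atomic variant: reduction group g of
+            // output element i writes its partial value to
+            // out_[i * n_reduction_groups + g]; a follow-up kernel then
+            // reduces the n_reduction_groups partials of each element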
n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template class kernel_name_token> +sycl::event + sequential_reduction(sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems)); + }); + + return red_ev; +} + +template +class custom_reduction_wrapper; + +template class kernel_name_token> +sycl::event + submit_atomic_reduction(sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupWithAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_with_atomics_init_krn; + +template +class reduction_seq_krn; + +template +class reduction_over_group_with_atomics_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event reduction_over_group_with_atomics_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_with_atomics_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +// Contig + +typedef sycl::event (*reduction_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +/* @brief Reduce rows in a matrix */ +template +sycl::event reduction_axis1_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                RowsIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const RowsIndexerT rows_indexer{/* size */ iter_nelems,
+                                        /* step */ reduction_nelems};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
+                                                          result_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
+                : preferred_reductions_per_wi;
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+
+        sycl::event comp_ev =
+            submit_atomic_reduction(
+                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+                reduction_nelems, reductions_per_wi, reduction_groups,
+                in_out_iter_indexer, reduction_indexer, {res_init_ev});
+
+        return comp_ev;
+    }
+}
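
[Editor's note, not part of the patch] The axis1 implementation above and the axis0 implementation below differ only in how a flat element index is mapped into the row-major input: axis1 reduces contiguous runs (rows), axis0 reduces columns whose elements are `iter_nelems` apart. A minimal host-side C++ sketch of the index arithmetic that the `Strided1DIndexer`/`NoOpIndexer` pairs encode (names and sizes are illustrative only):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t n_rows = 3, n_cols = 4;
        std::vector<int> m(n_rows * n_cols);
        for (std::size_t i = 0; i < m.size(); ++i)
            m[i] = static_cast<int>(i);

        // axis1 (reduce each row): iteration step == reduction_nelems
        // (= n_cols); the elements of one reduction are contiguous.
        for (std::size_t it = 0; it < n_rows; ++it) {
            int acc = 0;
            for (std::size_t r = 0; r < n_cols; ++r)
                acc += m[it * n_cols + r];
            std::cout << "row " << it << " sum = " << acc << '\n';
        }

        // axis0 (reduce each column): reduction step == iter_nelems
        // (= n_cols); consecutive elements of one reduction are n_cols apart.
        for (std::size_t it = 0; it < n_cols; ++it) {
            int acc = 0;
            for (std::size_t r = 0; r < n_rows; ++r)
                acc += m[it + r * n_cols];
            std::cout << "col " << it << " sum = " << acc << '\n';
        }
        return 0;
    }

The `Strided1DIndexer{/* size */, /* step */}` constructions in the kernels express exactly these `it * step` and `it + r * step` mappings.
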
+
+/* @brief Reduce columns in a matrix */
+template
+sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of reductions (num. of cols in a
+                             // matrix when reducing over cols)
+    std::size_t reduction_nelems, // size of each reduction (length of cols,
+                                  // i.e. number of rows)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+    else {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = ColsIndexerT;
+
+        static constexpr NoOpIndexerT columns_indexer{};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                          result_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        static constexpr std::size_t preferred_reductions_per_wi = 8;
+        std::size_t reductions_per_wi =
+            (reduction_nelems < preferred_reductions_per_wi * wg)
+                ?
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ + +template class kernel_name_token> +sycl::event submit_no_atomic_reduction( + sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupNoAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_temps_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class reduction_over_group_temps_empty_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const ssize_t *iter_shape_and_strides,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    int red_nd,
+    const ssize_t *reduction_shape_stride,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = iter_shape_and_strides;
+            const ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class reduction_over_group_temps_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
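
[Editor's note, not part of the patch] The branch that follows sizes a multi-pass tree reduction: each pass launches ceil(n / (reductions_per_wi * wg)) groups per reduction, each group emitting one partial value, and the two halves of the scratch allocation are ping-ponged until one group suffices. A host-side sketch of that sizing arithmetic (the kernels additionally distinguish `wg` from the device limit `max_wg`; this sketch uses a single `wg`, and all values are arbitrary examples):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const std::size_t wg = 256; // work-group size
        const std::size_t rpw = 8;  // preferred_reductions_per_wi
        std::size_t n = 10'000'000; // reduction_nelems

        // Pass k shrinks n partial values to ceil(n / (rpw * wg)).
        // The temporary holds iter_nelems * (groups_pass0 + groups_pass1)
        // elements, enough to ping-pong between its two halves.
        std::size_t pass = 0;
        while (n > rpw * wg) {
            std::size_t groups = (n + rpw * wg - 1) / (rpw * wg); // ceil div
            std::cout << "pass " << pass++ << ": " << n << " -> " << groups
                      << " partial results\n";
            n = groups;
        }
        std::cout << "final pass reduces " << n
                  << " values in one work-group\n";
        return 0;
    }
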
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+
+            // Only 2*iter_nd entries describing shape and strides of
+            // iterated dimensions of input array from
+            // iter_shape_and_strides are going to be accessed by
+            // inp_indexer
+            const InputIndexerT inp_indexer(iter_nd, iter_arg_offset,
+                                            iter_shape_and_strides);
+            static constexpr ResIndexerT noop_tmp_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                red_nd, reduction_arg_offset, reduction_shape_stride};
+
+            first_reduction_ev = submit_no_atomic_reduction<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
+                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            sycl::event partial_reduction_ev;
+            {
+                using InputIndexerT =
+                    dpctl::tensor::offset_utils::Strided1DIndexer;
+                using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+                using InputOutputIterIndexerT =
+                    dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                        InputIndexerT, ResIndexerT>;
+                using ReductionIndexerT =
+                    dpctl::tensor::offset_utils::NoOpIndexer;
+
+                const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                                /* step */ reduction_groups_};
+                static constexpr ResIndexerT res_iter_indexer{};
+
+                const InputOutputIterIndexerT in_out_iter_indexer{
+                    inp_indexer, res_iter_indexer};
+                static constexpr ReductionIndexerT reduction_indexer{};
+
+                partial_reduction_ev = submit_no_atomic_reduction<
+                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT, reduction_over_group_temps_krn>(
+                    exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
+                    remaining_reduction_nelems, preferred_reductions_per_wi,
+                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
+                    {dependent_ev});
+            }
+
+            remaining_reduction_nelems = reduction_groups_;
+            std::swap(temp_arg, temp2_arg);
+            dependent_ev = std::move(partial_reduction_ev);
+        }
+
+        // final reduction to res
+        using InputIndexerT =
dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event reduction_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                InputIterIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = NoOpIndexerT;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            InputIterIndexerT{/* size */ iter_nelems,
+                              /* step */ reduction_nelems},
+            NoOpIndexerT{}};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+        auto tmp_owner =
dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + noop_tmp_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, 
(remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev = submit_no_atomic_reduction<
+            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
+            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner);
+
+        // FIXME: do not return host-task event
+        // Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template
+sycl::event reduction_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of reductions (num. of cols in a
+                             // matrix when reducing over cols)
+    std::size_t reduction_nelems, // size of each reduction (length of cols,
+                                  // i.e. number of rows)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr resTy identity_val =
+        su_ns::Identity<ReductionOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev =
+            sequential_reduction(
+                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
+                reduction_nelems, in_out_iter_indexer, reduction_indexer,
+                depends);
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = ColsIndexerT;
+
+        static constexpr NoOpIndexerT columns_indexer{};
+        static constexpr NoOpIndexerT result_indexer{};
+        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                          result_indexer};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
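
[Editor's note, not part of the patch] The single-group path that follows relies on an invariant the assert below checks: picking reductions_per_wi = ceil(reduction_nelems / wg) forces reduction_groups to come out as exactly 1. A small host-side verification of that arithmetic (loop bounds and work-group sizes are arbitrary examples; the real kernels clamp wg against the device limit):

    #include <cassert>
    #include <cstddef>

    int main()
    {
        for (std::size_t n = 1; n < 5000; ++n) { // reduction_nelems
            for (std::size_t wg : {std::size_t(8), std::size_t(64),
                                   std::size_t(256)}) {
                std::size_t rpw = (n + wg - 1) / wg; // ceil(n / wg), >= 1
                std::size_t groups = (n + rpw * wg - 1) / (rpw * wg);
                // rpw * wg >= n, so the ceiling division yields exactly 1
                assert(groups == 1);
            }
        }
        return 0;
    }
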
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev = submit_no_atomic_reduction<
+            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+            ReductionIndexerT, reduction_over_group_temps_krn>(
+            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
+            reduction_nelems, reductions_per_wi, reduction_groups,
+            in_out_iter_indexer, reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
+        std::size_t reduction_groups =
+            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        std::size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
+            (preferred_reductions_per_wi * wg);
+
+        const std::size_t tmp_alloc_size =
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
+
+        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
+            tmp_alloc_size, exec_q);
+
+        resTy *partially_reduced_tmp = tmp_owner.get();
+        resTy *partially_reduced_tmp2 =
+            partially_reduced_tmp + reduction_groups * iter_nelems;
+
+        sycl::event first_reduction_ev;
+        {
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = ColsIndexerT;
+
+            static constexpr NoOpIndexerT columns_indexer{};
+            static constexpr NoOpIndexerT noop_tmp_indexer{};
+            const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                              noop_tmp_indexer};
+            const ReductionIndexerT reduction_indexer{
+                /* size */ reduction_nelems,
+                /* step */ iter_nelems};
+
+            first_reduction_ev = submit_no_atomic_reduction<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT, reduction_over_group_temps_krn>(
+                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
+                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
+                reduction_groups, in_out_iter_indexer, reduction_indexer,
+                depends);
+        }
+
+        std::size_t remaining_reduction_nelems = reduction_groups;
+
+        resTy *temp_arg = partially_reduced_tmp;
+        resTy *temp2_arg = partially_reduced_tmp2;
+        sycl::event dependent_ev = first_reduction_ev;
+
+        while (remaining_reduction_nelems >
+               preferred_reductions_per_wi * max_wg) {
+            std::size_t reduction_groups_ =
+                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
+                 1) /
+                (preferred_reductions_per_wi * wg);
+            assert(reduction_groups_ > 1);
+
+            // keep reducing
+            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    InputIndexerT, ResIndexerT>;
+            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+
+            const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                            /* step */ reduction_groups_};
+            static constexpr ResIndexerT res_iter_indexer{};
+
+            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                              res_iter_indexer};
+            static constexpr ReductionIndexerT reduction_indexer{};
+
+            sycl::event partial_reduction_ev = submit_no_atomic_reduction<
+                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +// Argmax and Argmin + +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction( + const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val 
== red_val) {
+            idx_val = idx_reduction_op_(idx_val, static_cast<outT>(m));
+        }
+        else {
+            if constexpr (su_ns::IsMinimum::value) {
+                using dpctl::tensor::type_utils::is_complex;
+                if constexpr (is_complex<argT>::value) {
+                    using dpctl::tensor::math_utils::less_complex;
+                    // less_complex always returns false for NaNs, so check
+                    if (less_complex(val, red_val) ||
+                        std::isnan(std::real(val)) ||
+                        std::isnan(std::imag(val))) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else if constexpr (std::is_floating_point_v<argT> ||
+                                   std::is_same_v<argT, sycl::half>) {
+                    if (val < red_val || std::isnan(val)) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else {
+                    if (val < red_val) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+            }
+            else if constexpr (su_ns::IsMaximum::value) {
+                using dpctl::tensor::type_utils::is_complex;
+                if constexpr (is_complex<argT>::value) {
+                    using dpctl::tensor::math_utils::greater_complex;
+                    if (greater_complex(val, red_val) ||
+                        std::isnan(std::real(val)) ||
+                        std::isnan(std::imag(val))) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else if constexpr (std::is_floating_point_v<argT> ||
+                                   std::is_same_v<argT, sycl::half>) {
+                    if (val > red_val || std::isnan(val)) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+                else {
+                    if (val > red_val) {
+                        red_val = val;
+                        idx_val = static_cast<outT>(m);
+                    }
+                }
+            }
+        }
+    }
+    out_[out_iter_offset] = idx_val;
+    }
+};
+
+/* = Search reduction using reduce_over_group = */
+
+template
+struct SearchReduction
+{
+private:
+    const argT *inp_ = nullptr;
+    argT *vals_ = nullptr;
+    const outT *inds_ = nullptr;
+    outT *out_ = nullptr;
+    ReductionOp reduction_op_;
+    argT identity_;
+    IdxReductionOp idx_reduction_op_;
+    outT idx_identity_;
+    InputOutputIterIndexerT inp_out_iter_indexer_;
+    InputRedIndexerT inp_reduced_dims_indexer_;
+    std::size_t reduction_max_gid_ = 0;
+    std::size_t iter_gws_ = 1;
+    std::size_t reductions_per_wi = 16;
+
+public:
+    SearchReduction(const argT *data,
+                    argT *vals,
+                    const outT *inds,
+                    outT *res,
+                    const ReductionOp &reduction_op,
+                    const argT &identity_val,
+                    const IdxReductionOp &idx_reduction_op,
+                    const outT &idx_identity_val,
+                    const InputOutputIterIndexerT &arg_res_iter_indexer,
+                    const InputRedIndexerT &arg_reduced_dims_indexer,
+                    std::size_t reduction_size,
+                    std::size_t iteration_size,
+                    std::size_t reduction_size_per_wi)
+        : inp_(data), vals_(vals), inds_(inds), out_(res),
+          reduction_op_(reduction_op), identity_(identity_val),
+          idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val),
+          inp_out_iter_indexer_(arg_res_iter_indexer),
+          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
+          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
+          reductions_per_wi(reduction_size_per_wi)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); // 0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operate over input with indices
+        // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //               + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        argT local_red_val(identity_);
+        outT
local_idx(idx_identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + 
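
[Editor's note, not part of the patch] In operator() below, as in SearchReduction above, the flattened one-dimensional nd-range packs two coordinates into the group index: group g serves reduction g % iter_gws_ and reduction batch g / iter_gws_, and each work-item then strides through its batch wg elements at a time. A host-side sketch of that decomposition with toy sizes (the real kernels additionally clamp each gid against reduction_max_gid_):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const std::size_t iter_gws = 3;  // number of independent reductions
        const std::size_t n_batches = 2; // reduction groups per reduction
        const std::size_t wg = 4;        // work-group size
        const std::size_t rpw = 2;       // reductions_per_wi

        for (std::size_t g = 0; g < iter_gws * n_batches; ++g) {
            const std::size_t iter_gid = g % iter_gws;
            const std::size_t batch_id = g / iter_gws;
            // work-item lid in this group reads gids
            // lid + batch_id * wg * rpw + m * wg, for 0 <= m < rpw
            const std::size_t base = batch_id * wg * rpw;
            std::cout << "group " << g << ": reduction " << iter_gid
                      << ", batch " << batch_id << ", covers gids [" << base
                      << ", " << base + wg * rpw - 1 << "]\n";
        }
        return 0;
    }
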
void operator()(sycl::nd_item<1> it) const
+    {
+        const std::size_t reduction_lid = it.get_local_id(0);
+        const std::size_t wg =
+            it.get_local_range(0); // 0 <= reduction_lid < wg
+
+        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
+        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+        const std::size_t n_reduction_groups =
+            it.get_group_range(0) / iter_gws_;
+
+        // work-items operate over input with indices
+        // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
+        //               + reduction_lid
+        // for 0 <= m < reductions_per_wi
+
+        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
+        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
+        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
+
+        argT local_red_val(identity_);
+        outT local_idx(idx_identity_);
+        std::size_t arg_reduce_gid0 =
+            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
+        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
+            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
+
+            if (arg_reduce_gid < reduction_max_gid_) {
+                auto inp_reduction_offset =
+                    inp_reduced_dims_indexer_(arg_reduce_gid);
+                auto inp_offset = inp_iter_offset + inp_reduction_offset;
+
+                argT val = inp_[inp_offset];
+                if (val == local_red_val) {
+                    if constexpr (!First) {
+                        local_idx =
+                            idx_reduction_op_(local_idx, inds_[inp_offset]);
+                    }
+                    else {
+                        local_idx = idx_reduction_op_(
+                            local_idx, static_cast<outT>(arg_reduce_gid));
+                    }
+                }
+                else {
+                    if constexpr (su_ns::IsMinimum::value) {
+                        using dpctl::tensor::type_utils::is_complex;
+                        if constexpr (is_complex<argT>::value) {
+                            using dpctl::tensor::math_utils::less_complex;
+                            // less_complex always returns false for NaNs, so
+                            // check
+                            if (less_complex(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val))) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT> ||
+                                           std::is_same_v<argT, sycl::half>) {
+                            if (val < local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val < local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                    else if constexpr (su_ns::IsMaximum::value) {
+                        using dpctl::tensor::type_utils::is_complex;
+                        if constexpr (is_complex<argT>::value) {
+                            using dpctl::tensor::math_utils::greater_complex;
+                            if (greater_complex(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val))) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT> ||
+                                           std::is_same_v<argT, sycl::half>) {
+                            if (val > local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val > local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from small set of operators
+        argT red_val_over_wg = su_ns::custom_reduce_over_group(
+ work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class search_seq_strided_krn; + +template +class search_seq_contig_krn; + +template +class search_over_group_krn; + +template +class custom_search_over_group_krn; + +template +class search_empty_krn; + +template +sycl::event + submit_search_reduction(sycl::queue &exec_q, + const argTy *arg, + argTy *arg_tmp, + resTy *res_tmp, + resTy *res, + argTy identity_val, + resTy idx_identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_over_group_krn; + cgh.parallel_for( + ndRange, SearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_search_over_group_krn< + argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT, First, Last>; + cgh.parallel_for( + ndRange, + CustomSearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +sycl::event search_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const ssize_t *iter_shape_and_strides,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    int red_nd,
+    const ssize_t *reduction_shape_stride,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    static constexpr argTy identity_val =
+        su_ns::Identity<ReductionOpT, argTy>::value;
+    static constexpr resTy idx_identity_val =
+        su_ns::Identity<IndexOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const ssize_t *const &res_shape = iter_shape_and_strides;
+            const ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                       res_strides);
+            using InitKernelName =
+                class search_empty_krn;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = idx_identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.parallel_for>(
+                sycl::range<1>(iter_nelems),
+                SequentialSearchReduction(
+                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
+                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
+                    reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 4;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
+
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{
+            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
+        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                  reduction_shape_stride};
+
+        if (iter_nelems == 1) {
+            // increase GPU occupancy
+            wg = max_wg;
+        }
+        reductions_per_wi =
+            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+        std::size_t reduction_groups =
+            (reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event comp_ev =
+            submit_search_reduction(
+                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, depends);
+
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a
temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + static constexpr ResIndexerT noop_tmp_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, 
preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +typedef sycl::event (*search_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event search_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + 
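+        // Ceiling division idiom: ceil(a / b) is computed in integer
+        // arithmetic as (a + b - 1) / b. With hypothetical sizes
+        // reduction_groups = 4096, preferred_reductions_per_wi = 8 and
+        // wg = 256, the second pass would need ceil(4096 / 2048) = 2 groups;
+        // sizing the temporary for both passes up front avoids a second
+        // allocation.
+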
(preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer;
+
+        const InputIndexerT inp_indexer{/* size */ iter_nelems,
+                                        /* step */ remaining_reduction_nelems};
+        static constexpr ResIndexerT res_iter_indexer{};
+
+        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
+                                                          res_iter_indexer};
+        static constexpr ReductionIndexerT reduction_indexer{};
+
+        wg = max_wg;
+        reductions_per_wi = std::max<std::size_t>(
+            1, (remaining_reduction_nelems + wg - 1) / wg);
+
+        reduction_groups =
+            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
+            (reductions_per_wi * wg);
+        assert(reduction_groups == 1);
+
+        sycl::event final_reduction_ev =
+            submit_search_reduction(
+                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
+                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
+                reductions_per_wi, reduction_groups, in_out_iter_indexer,
+                reduction_indexer, {dependent_ev});
+
+        sycl::event cleanup_host_task_event =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner);
+
+        // FIXME: do not return host-task event
+        // Instead collect all host-tasks to a list
+
+        return cleanup_host_task_event;
+    }
+}
+
+template <typename argTy,
+          typename resTy,
+          typename ReductionOpT,
+          typename IndexOpT>
+sycl::event search_axis0_over_group_temps_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems,      // number of reductions (num. of rows in a
+                                  // matrix when reducing over rows)
+    std::size_t reduction_nelems, // size of each reduction (length of rows,
+                                  // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    static constexpr argTy identity_val =
+        su_ns::Identity<ReductionOpT, argTy>::value;
+    static constexpr resTy idx_identity_val =
+        su_ns::Identity<IndexOpT, resTy>::value;
+
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    if (reduction_nelems < wg) {
+        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+        using InputOutputIterIndexerT =
+            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                NoOpIndexerT, NoOpIndexerT>;
+        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+
+        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
+                                                          NoOpIndexerT{}};
+        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
+                                                  /* step */ iter_nelems};
+
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using KernelName =
+                class search_seq_contig_krn<argTy, resTy, ReductionOpT,
+                                            IndexOpT, InputOutputIterIndexerT,
+                                            ReductionIndexerT>;
+
+            sycl::range<1> iter_range{iter_nelems};
+
+            cgh.parallel_for<KernelName>(
+                iter_range,
+                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                          InputOutputIterIndexerT,
+                                          ReductionIndexerT>(
+                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
+                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
+                    reduction_nelems));
+        });
+
+        return comp_ev;
+    }
+
+    static constexpr std::size_t preferred_reductions_per_wi = 8;
+    // prevents running out of resources on CPU
+    std::size_t max_wg = reduction_detail::get_work_group_size(d);
+
+    std::size_t reductions_per_wi(preferred_reductions_per_wi);
+    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
+        // Perform reduction using one work-group per iteration,
+        // can output directly to res
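+        // Illustrative sizing with hypothetical numbers: for
+        // reduction_nelems = 4096, wg = 256 and max_wg = 1024, this branch is
+        // taken because 4096 <= 8 * 1024; then
+        //   reductions_per_wi = max(1, ceil(4096 / 256)) = 16
+        //   reduction_groups  = ceil(4096 / (16 * 256)) = 1
+        // i.e. a single work-group spans the whole reduction and the result
+        // can be written to res without a temporary.
+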
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto vals_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = vals_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{ + /* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + 
(remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, vals_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/repeat.hpp b/dpnp/tensor/libtensor/include/kernels/repeat.hpp new file mode 100644 index 000000000000..83a520adb538 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/repeat.hpp @@ -0,0 +1,460 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor repeating operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" + +namespace dpctl::tensor::kernels::repeat +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +class repeat_by_sequence_kernel; + +template +class RepeatSequenceFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const repT *reps = nullptr; + const repT *cumsum = nullptr; + std::size_t src_axis_nelems = 1; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + RepIndexer reps_strider; + +public: + RepeatSequenceFunctor(const T *src_, + T *dst_, + const repT *reps_, + const repT *cumsum_, + std::size_t src_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer &src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_, + const RepIndexer &reps_strider_) + : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), + src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), + src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / src_axis_nelems; + auto i_along = id - (i_orthog * src_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto val = src[src_offset + src_axis_strider(i_along)]; + auto last = cumsum[i_along]; + auto first = last - reps[reps_strider(i_along)]; + for (auto i = first; i < last; ++i) { + dst[dst_offset + dst_axis_strider(i)] = val; + } + } +}; + +typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const char *, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + 
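+    // The unnamed parameters correspond, in order, to those of
+    // repeat_by_sequence_impl below: (queue, orthog_nelems, src_axis_nelems,
+    // src, dst, reps, cumsum, orthog_nd, orthog_src_dst_shape_and_strides,
+    // src_offset, dst_offset, src_axis_shape, src_axis_stride,
+    // dst_axis_shape, dst_axis_stride, reps_shape, reps_stride, depends).
+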
ssize_t, + const std::vector &); + +template +sycl::event + repeat_by_sequence_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t src_axis_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int orthog_nd, + const ssize_t *orthog_src_dst_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, + orthog_src_dst_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = orthog_nelems * src_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, + orthog_indexer, src_axis_indexer, dst_axis_indexer, + reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequenceFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const char *, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, + std::size_t src_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; + const Strided1DIndexer dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = src_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, + src_indexer, dst_indexer, reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequence1DFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_1d_impl; + return fn; + } +}; + +template +class repeat_by_scalar_kernel; + +template +class RepeatScalarFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + ssize_t reps = 1; + std::size_t 
dst_axis_nelems = 0; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + +public: + RepeatScalarFunctor(const T *src_, + T *dst_, + const ssize_t reps_, + std::size_t dst_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer &src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_) + : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), + orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / dst_axis_nelems; + auto i_along = id - (i_orthog * dst_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto dst_axis_offset = dst_axis_strider(i_along); + auto src_axis_offset = src_axis_strider(i_along / reps); + dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset]; + } +}; + +typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t dst_axis_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int orthog_nd, + const ssize_t *orthog_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, orthog_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; + + const std::size_t gws = orthog_nelems * dst_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, + src_axis_indexer, dst_axis_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalarFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, + std::size_t dst_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer(src_nd, 0, src_shape_strides); + const Strided1DIndexer 
dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + + const std::size_t gws = dst_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor(src_tp, dst_tp, reps, + dst_nelems, orthog_indexer, + src_indexer, dst_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalar1DFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_1d_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::repeat diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp new file mode 100644 index 000000000000..847fa96ecdff --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/isin.hpp @@ -0,0 +1,245 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor membership operations. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" +#include "utils/rich_comparisons.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct IsinFunctor +{ +private: + bool invert; + const T *hay_tp; + const T *needles_tp; + bool *out_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + OutIndexerT out_indexer; + +public: + IsinFunctor(const bool invert_, + const T *hay_, + const T *needles_, + bool *out_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const OutIndexerT &out_indexer_) + : invert(invert_), hay_tp(hay_), needles_tp(needles_), out_tp(out_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), out_indexer(out_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + using Compare = + typename dpctl::tensor::rich_comparisons::AscendingSorter::type; + static constexpr Compare comp{}; + + const std::size_t i = id[0]; + const T needle_v = needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + std::size_t pos{}; + + static constexpr std::size_t zero(0); + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + bool out = (pos == hay_nelems ? false : hay_tp[pos] == needle_v); + out_tp[out_indexer(i)] = (invert) ? 
!out : out; + } +}; + +typedef sycl::event (*isin_contig_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class isin_contig_impl_krn; + +template +sycl::event isin_contig_impl(sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const T *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + bool *out_tp = reinterpret_cast(out_cp) + out_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = class isin_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT out_indexer{}; + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + needles_indexer, out_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*isin_strided_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class isin_strided_impl_krn; + +template +sycl::event isin_strided_impl( + sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // out_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp); + const T *needles_tp = reinterpret_cast(needles_cp); + + bool *out_tp = reinterpret_cast(out_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = packed_shape_strides; + const NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *out_shape = packed_shape_strides; + const ssize_t *out_strides = packed_shape_strides + 2 * needles_nd; + const OutIndexerT out_indexer(needles_nd, out_offset, out_shape, + out_strides); + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + needles_indexer, out_indexer); + using KernelName = class isin_strided_impl_krn; + + 
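+        // Illustrative trace (hypothetical data): for sorted hay = {1, 3, 5,
+        // 7} and needle_v = 5, lower_bound gives pos = 2 (hay[1] = 3 < 5,
+        // hay[2] = 5 is not < 5), and hay[2] == 5, so out = true; for
+        // needle_v = 8, pos = 4 == hay_nelems, so out = false. With
+        // invert == true each result is negated, matching the
+        // inverted-membership (invert=True) behaviour.
+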
cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp new file mode 100644 index 000000000000..75d3dc5f01a0 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -0,0 +1,844 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace merge_sort_detail +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::kernels::search_sorted_detail; + +/*! 
@brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = (start_2 < end_2) ? in_acc[start_2] : r_item_1; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. 
Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; + ++idx) { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; + ++idx) { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +template +void insertion_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + for (std::size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + std::size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void leaf_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + return insertion_sort_impl(std::forward(first), + std::move(begin), std::move(end), + std::forward(comp)); +} + +template +struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetReadOnlyAccess +{ + Iter operator()(const Iter &it, sycl::handler &) { return it; } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(const sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template +struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) { return it; } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template +struct GetReadWriteAccess +{ + Iter operator()(Iter &it, sycl::handler &) { return it; } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_base_step_contig_krn; + +template +sycl::event + sort_base_step_contig_impl(sycl::queue &q, + const std::size_t iter_nelems, + const std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + const std::size_t conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + const std::size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const std::size_t iter_id = id[0] / n_segments; + const std::size_t segment_id = id[0] - iter_id * n_segments; + + const std::size_t iter_offset = iter_id * sort_nelems; + const std::size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const std::size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, sort_nelems); + for (std::size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); 
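+        // Illustrative decomposition (hypothetical numbers): with
+        // sort_nelems = 100 and conseq_nelems_sorted = 16, n_segments =
+        // ceil(100 / 16) = 7; the work-item for (iter_id, segment_id) copies
+        // its slice [beg_id, end_id) of the row into output and
+        // insertion-sorts it in place, leaving each row as a run of sorted
+        // blocks of length conseq_nelems_sorted (the last one possibly
+        // shorter).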
+ }); + + return base_sort; +} + +template +class sort_over_work_group_contig_krn; + +template +sycl::event sort_over_work_group_contig_impl( + sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + std::size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using T = typename GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const std::size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4); + + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const std::size_t n_segments = quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = segment_id * nelems_wg_sorts; + const std::size_t segment_end_idx = + std::min(segment_start_idx + nelems_wg_sorts, sort_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) { + T v = (array_id < sort_nelems) + ? 
input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = + sycl::min(start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + const auto &out_src = (data_in_temp) ? scratch_space : work_space; + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template +class merge_adjacent_blocks_to_temp_krn; + +template +class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event + merge_sorted_block_contig_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + Acc output, + const Comp comp, + std::size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const std::size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const std::size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + std::size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const std::size_t n_chunks = quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // namespace merge_sort_detail + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of 
sub-arrays to sort (num. of rows in a
+                             // matrix when sorting over rows)
+    std::size_t sort_nelems, // size of each array to sort (length of rows,
+                             // i.e. number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    ssize_t iter_arg_offset,
+    ssize_t iter_res_offset,
+    ssize_t sort_arg_offset,
+    ssize_t sort_res_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + sort_arg_offset;
+    argTy *res_tp =
+        reinterpret_cast<argTy *>(res_cp) + iter_res_offset + sort_res_offset;
+
+    auto comp = Comp{};
+
+    // constant chosen experimentally to ensure monotonicity of
+    // sorting performance, as measured on GPU Max and Iris Xe
+    constexpr std::size_t sequential_sorting_threshold = 16;
+
+    if (sort_nelems < sequential_sorting_threshold) {
+        // each work-item sorts an entire row
+        sycl::event sequential_sorting_ev =
+            merge_sort_detail::sort_base_step_contig_impl(
+                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
+                sort_nelems, depends);
+
+        return sequential_sorting_ev;
+    }
+    else {
+        std::size_t sorted_block_size{};
+
+        // Sort segments of the array
+        sycl::event base_sort_ev =
+            merge_sort_detail::sort_over_work_group_contig_impl(
+                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
+                sorted_block_size, // modified in place with the size of the
+                                   // sorted block
+                depends);
+
+        // Merge segments in parallel until all elements are sorted
+        sycl::event merges_ev =
+            merge_sort_detail::merge_sorted_block_contig_impl(
+                exec_q, iter_nelems, sort_nelems, res_tp, comp,
+                sorted_block_size, {base_sort_ev});
+
+        return merges_ev;
+    }
+}
+
+template <typename T1, typename T2, typename T3>
+class populate_index_data_krn;
+
+template <typename T1, typename T2, typename T3>
+class index_map_to_rows_krn;
+
+template <typename IndexT, typename ValueT, typename ValueComp>
+struct IndexComp
+{
+    IndexComp(const ValueT *data, const ValueComp &comp_op)
+        : ptr(data), value_comp(comp_op)
+    {
+    }
+
+    bool operator()(const IndexT &i1, const IndexT &i2) const
+    {
+        return value_comp(ptr[i1], ptr[i2]);
+    }
+
+private:
+    const ValueT *ptr;
+    ValueComp value_comp;
+};
+
+template <typename argTy,
+          typename IndexTy,
+          typename ValueComp = std::less<argTy>>
+sycl::event stable_argsort_axis1_contig_impl(
+    sycl::queue &exec_q,
+    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a
+                             // matrix when sorting over rows)
+    std::size_t sort_nelems, // size of each array to sort (length of rows,
+                             // i.e.
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const IndexComp index_comp{arg_tp, ValueComp{}}; + + static constexpr std::size_t determine_automatically = 0; + std::size_t sorted_block_size = determine_automatically; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + using IotaKernelName = populate_index_data_krn; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, res_tp, total_nelems, depends); + + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, res_tp, index_comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, + {base_sort_ev}); + + // no need to map back if iter_nelems == 1 + if (iter_nelems == 1u) { + return merges_ev; + } + + using MapBackKernelName = index_map_to_rows_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event write_out_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev}); + + return write_out_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp new file mode 100644 index 000000000000..5baa98e237df --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -0,0 +1,1905 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace radix_sort_details +{ + +template +class radix_sort_count_kernel; + +template +class radix_sort_scan_kernel; + +template +class radix_sort_reorder_peer_kernel; + +template +class radix_sort_reorder_kernel; + +/*! @brief Computes smallest exponent such that `n <= (1 << exponent)` */ +template && + sizeof(SizeT) == sizeof(std::uint64_t), + int> = 0> +std::uint32_t ceil_log2(SizeT n) +{ + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1) + // ceil_log2(n) == 1 + floor_log2(n-1) + if (n <= 1) + return std::uint32_t{1}; + + std::uint32_t exp{1}; + --n; + if (n >= (SizeT{1} << 32)) { + n >>= 32; + exp += 32; + } + if (n >= (SizeT{1} << 16)) { + n >>= 16; + exp += 16; + } + if (n >= (SizeT{1} << 8)) { + n >>= 8; + exp += 8; + } + if (n >= (SizeT{1} << 4)) { + n >>= 4; + exp += 4; + } + if (n >= (SizeT{1} << 2)) { + n >>= 2; + exp += 2; + } + if (n >= (SizeT{1} << 1)) { + n >>= 1; + ++exp; + } + return exp; +} + +//---------------------------------------------------------- +// bitwise order-preserving conversions to unsigned integers +//---------------------------------------------------------- + +template +bool order_preserving_cast(bool val) +{ + if constexpr (is_ascending) + return val; + else + return !val; +} + +template , int> = 0> +UIntT order_preserving_cast(UIntT val) +{ + if constexpr (is_ascending) { + return val; + } + else { + // bitwise invert + return (~val); + } +} + +template && std::is_signed_v, + int> = 0> +std::make_unsigned_t order_preserving_cast(IntT val) +{ + using UIntT = std::make_unsigned_t; + const UIntT uint_val = sycl::bit_cast(val); + + if constexpr (is_ascending) { + // ascending_mask: 100..0 + static constexpr UIntT ascending_mask = + (UIntT(1) << std::numeric_limits::digits); + return (uint_val ^ ascending_mask); + } + else { + // descending_mask: 011..1 + static constexpr UIntT descending_mask = + (std::numeric_limits::max() >> 1); + return (uint_val ^ descending_mask); + } +} + +template +std::uint16_t order_preserving_cast(sycl::half val) +{ + using UIntT = std::uint16_t; + + const UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? 
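+    // Worked example of the order-preserving transform (ascending case,
+    // illustrative fp16 bit patterns, not taken from the code): non-negative
+    // values get only the sign bit flipped while negative values get all
+    // bits flipped, so unsigned comparison of the keys matches
+    // floating-point order:
+    //   -2.0 (0xC000) -> 0x3FFF,  -0.0 (0x8000) -> 0x7FFF,
+    //   +0.0 (0x0000) -> 0x8000,  +2.0 (0x4000) -> 0xC000.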
std::numeric_limits::quiet_NaN() + : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15)); + + static constexpr UIntT zero_mask = UIntT(0x8000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFu); + + static constexpr UIntT inv_zero_mask = static_cast(~zero_mask); + static constexpr UIntT inv_nonzero_mask = static_cast(~nonzero_mask); + + if constexpr (is_ascending) { + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + } + else { + mask = (zero_fp_sign_bit) ? (inv_zero_mask) : (inv_nonzero_mask); + } + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint32_t), + int> = 0> +std::uint32_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint32_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31)); + + static constexpr UIntT zero_mask = UIntT(0x80000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint64_t), + int> = 0> +std::uint64_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint64_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63)); + + static constexpr UIntT zero_mask = UIntT(0x8000000000000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? 
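+    // NB: as in the 16- and 32-bit variants above, every NaN payload is
+    // first canonicalized to quiet_NaN, so all NaNs map to the same key and
+    // sort as one contiguous block (at the end in ascending order).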
(~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +//----------------- +// bucket functions +//----------------- + +template +constexpr std::size_t number_of_bits_in_type() +{ + constexpr std::size_t type_bits = + (sizeof(T) * std::numeric_limits::digits); + return type_bits; +} + +// the number of buckets (size of radix bits) in T +template +constexpr std::uint32_t number_of_buckets_in_type(std::uint32_t radix_bits) +{ + constexpr std::size_t type_bits = number_of_bits_in_type(); + return (type_bits + radix_bits - 1) / radix_bits; +} + +// get bits value (bucket) in a certain radix position +template +std::uint32_t get_bucket_id(T val, std::uint32_t radix_offset) +{ + static_assert(std::is_unsigned_v); + + return (val >> radix_offset) & T(radix_mask); +} + +//-------------------------------- +// count kernel (single iteration) +//-------------------------------- + +template +sycl::event + radix_sort_count_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::size_t wg_size, + std::uint32_t radix_offset, + std::size_t n_values, + ValueT *vals_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const Proj &proj_op, + const bool is_ascending, + const std::vector &dependency_events) +{ + // bin_count = radix_states used for an array storing bucket state counters + static constexpr std::uint32_t radix_states = + (std::uint32_t(1) << radix_bits); + static constexpr std::uint32_t radix_mask = radix_states - 1; + + // iteration space info + const std::size_t n = n_values; + // each segment is processed by a work-group + const std::size_t elems_per_segment = (n + n_segments - 1) / n_segments; + const std::size_t no_op_flag_id = n_counts - 1; + + assert(n_counts == (n_segments + 1) * radix_states + 1); + + sycl::event local_count_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + + sycl::local_accessor counts_lacc(wg_size * radix_states, + cgh); + + sycl::nd_range<1> ndRange(n_iters * n_segments * wg_size, wg_size); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + // 0 <= lid < wg_size + const std::size_t lid = ndit.get_local_id(0); + // 0 <= group_id < n_segments * n_iters + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t val_iter_offset = iter_id * n; + // 0 <= wgr_id < n_segments + const std::size_t wgr_id = group_id - iter_id * n_segments; + + const std::size_t seg_start = elems_per_segment * wgr_id; + + // count per work-item: create a private array for storing count + // values here bin_count = radix_states + std::array counts_arr = {CountT{0}}; + + // count per work-item: count values and write result to private + // count array + const std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n); + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + val_id])); + const std::uint32_t bucket_id = + get_bucket_id(val, radix_offset); + + // increment counter for this bit bucket + ++counts_arr[bucket_id]; + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + 
val_id]));
+                    const std::uint32_t bucket_id =
+                        get_bucket_id<radix_mask>(val, radix_offset);
+
+                    // increment counter for this bit bucket
+                    ++counts_arr[bucket_id];
+                }
+            }
+
+            // count per work-item: write the private count array to the
+            // local count array; counts_lacc is the concatenation of the
+            // private count arrays from each work-item, in the order of
+            // their local ids
+            const std::uint32_t count_start_id = radix_states * lid;
+            for (std::uint32_t radix_state_id = 0;
+                 radix_state_id < radix_states; ++radix_state_id) {
+                counts_lacc[count_start_id + radix_state_id] =
+                    counts_arr[radix_state_id];
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            // count per work-group: reduce while counts_lacc[] is larger
+            // than wg_size; all work-items in the work-group do the work
+            for (std::uint32_t i = 1; i < radix_states; ++i) {
+                // Since we are interested in computing the total count over
+                // the work-group for each radix state, the result is only
+                // correct if wg_size >= radix_states
+                counts_lacc[lid] += counts_lacc[wg_size * i + lid];
+            }
+
+            sycl::group_barrier(ndit.get_group());
+
+            // count per work-group: reduce until counts_lacc[] is no larger
+            // than radix_states (n_witems /= 2 per iteration)
+            for (std::uint32_t n_witems = (wg_size >> 1);
+                 n_witems >= radix_states; n_witems >>= 1) {
+                if (lid < n_witems)
+                    counts_lacc[lid] += counts_lacc[n_witems + lid];
+
+                sycl::group_barrier(ndit.get_group());
+            }
+
+            const std::size_t iter_counter_offset = iter_id * n_counts;
+
+            // count per work-group: write the local count array to the
+            // global count array
+            if (lid < radix_states) {
+                // move buckets with the same id to adjacent positions,
+                // thus splitting the count array into radix_states regions
+                counts_ptr[iter_counter_offset + (n_segments + 1) * lid +
+                           wgr_id] = counts_lacc[lid];
+            }
+
+            // side work: reset the 'no-operation' flag, which signals
+            // whether the re-order phase can be skipped
+            if (wgr_id == 0 && lid == 0) {
+                CountT &no_op_flag =
+                    counts_ptr[iter_counter_offset + no_op_flag_id];
+                no_op_flag = 0;
+            }
+        });
+    });
+
+    return local_count_ev;
+}
+
+//-----------------------------------------------------------------------
+// radix sort: scan kernel (single iteration)
+//-----------------------------------------------------------------------
+
+template <typename KernelNameT, std::uint32_t radix_bits, typename CountT>
+sycl::event radix_sort_scan_submit(sycl::queue &exec_q,
+                                   std::size_t n_iters,
+                                   std::size_t n_segments,
+                                   std::size_t wg_size,
+                                   std::size_t n_values,
+                                   std::size_t n_counts,
+                                   CountT *counts_ptr,
+                                   const std::vector<sycl::event> depends)
+{
+    const std::size_t no_op_flag_id = n_counts - 1;
+
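+    // Worked example (illustrative sizes, not taken from the code): with
+    // radix_states == 16 and n_segments == 3, each iteration's counts
+    // buffer holds 16 regions of scan_size == 4 counters plus a trailing
+    // no-op flag; the exclusive scan of region r turns the per-segment
+    // counts for bucket r into starting offsets, while its last element
+    // accumulates the total for bucket r, which the reorder phase folds
+    // into the cross-bucket prefix.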
+    // Scan produces local offsets using count values.
+    // There are no local offsets for the first segment, but the remaining
+    // segments must be scanned with respect to the count value of the
+    // first segment, which requires n + 1 positions
+    const std::size_t scan_size = n_segments + 1;
+    wg_size = std::min(scan_size, wg_size);
+
+    static constexpr std::uint32_t radix_states = std::uint32_t(1)
+                                                  << radix_bits;
+
+    // compiling the kernel ahead of submission prevents an out-of-resources
+    // issue, which may occur due to the use of collective algorithms such
+    // as joint_exclusive_scan even if local memory is not explicitly
+    // requested
+    sycl::event scan_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        sycl::nd_range<1> ndRange(n_iters * radix_states * wg_size, wg_size);
+
+        cgh.parallel_for<KernelNameT>(ndRange, [=](sycl::nd_item<1> ndit) {
+            const std::size_t group_id = ndit.get_group(0);
+            const std::size_t iter_id = group_id / radix_states;
+            const std::size_t wgr_id = group_id - iter_id * radix_states;
+            // find borders of a region with a specific bucket id
+            auto begin_ptr =
+                counts_ptr + scan_size * wgr_id + iter_id * n_counts;
+
+            sycl::joint_exclusive_scan(ndit.get_group(), begin_ptr,
+                                       begin_ptr + scan_size, begin_ptr,
+                                       CountT(0), sycl::plus{});
+
+            const auto lid = ndit.get_local_linear_id();
+
+            // NB: no race condition here, because the condition can be true
+            // for at most one work-item of one work-group
+            if ((lid == wg_size - 1) &&
+                (begin_ptr[scan_size - 1] == n_values)) {
+                // set the flag, since all the values fell into one bin;
+                // this optimization can trigger often for higher radix
+                // offsets (where the remaining bits are all zeros)
+                auto &no_op_flag =
+                    counts_ptr[iter_id * n_counts + no_op_flag_id];
+                no_op_flag = 1;
+            }
+        });
+    });
+
+    return scan_ev;
+}
+
+//-----------------------------------------------------------------------
+// radix sort: group level reorder algorithms
+//-----------------------------------------------------------------------
+
+struct empty_storage
+{
+    template <typename... T>
+    empty_storage(T &&...)
+    {
+    }
+};
+
+// std::uint32_t value with the `n` least significant bits set
+inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept
+{
+    static constexpr std::uint32_t zero{};
+    static constexpr std::uint32_t all_bits_set = ~zero;
+
+    return ~(all_bits_set << n);
+}
+
+enum class peer_prefix_algo
+{
+    subgroup_ballot,
+    atomic_fetch_or,
+    scan_then_broadcast
+};
+
+template <typename OffsetT, peer_prefix_algo Algo>
+struct peer_prefix_helper;
+
+template <typename AccT>
+auto get_accessor_pointer(const AccT &acc)
+{
+    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
+}
+
+template <typename OffsetT>
+struct peer_prefix_helper<OffsetT, peer_prefix_algo::atomic_fetch_or>
+{
+    using AtomicT =
+        sycl::atomic_ref<std::uint32_t,
+                         sycl::memory_order::relaxed,
+                         sycl::memory_scope::sub_group,
+                         sycl::access::address_space::local_space>;
+    using TempStorageT = sycl::local_accessor<std::uint32_t, 1>;
+
+private:
+    sycl::sub_group sgroup;
+    std::uint32_t lid;
+    std::uint32_t item_mask;
+    AtomicT atomic_peer_mask;
+
+public:
+    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT lacc)
+        : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()),
+          item_mask(n_ls_bits_set(lid)), atomic_peer_mask(lacc[0])
+    {
+    }
+
+    std::uint32_t peer_contribution(OffsetT &new_offset_id,
+                                    OffsetT offset_prefix,
+                                    bool wi_bit_set) const
+    {
+        // reset mask for each radix state
+        if (lid == 0)
+            atomic_peer_mask.store(std::uint32_t{0});
+        sycl::group_barrier(sgroup);
+
+        const std::uint32_t uint_contrib{wi_bit_set ?
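+        // Peer-contribution sketch (hypothetical 8-wide sub-group, not
+        // taken from the code): if work-items {0, 2, 3, 5} hold the current
+        // bucket, the shared mask becomes 0b00101101; work-item 3 keeps
+        // only the bits below its lane (0b00000101), so popcount gives it
+        // rank 2 within the bucket, while popcount of the full mask (4)
+        // advances the bucket offset for the next sub-group pass.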
std::uint32_t{1} + : std::uint32_t{0}}; + + // set local id's bit to 1 if the bucket value matches the radix state + atomic_peer_mask.fetch_or(uint_contrib << lid); + sycl::group_barrier(sgroup); + std::uint32_t peer_mask_bits = atomic_peer_mask.load(); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask_bits &= item_mask; + new_offset_id |= wi_bit_set + ? (offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT{0}; + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ + using TempStorageT = empty_storage; + using ItemType = sycl::nd_item<1>; + using SubGroupType = sycl::sub_group; + +private: + SubGroupType sgroup; + std::uint32_t sg_size; + +public: + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), sg_size(sgroup.get_local_range()[0]) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + const std::uint32_t contrib{wi_bit_set ? std::uint32_t{1} + : std::uint32_t{0}}; + + std::uint32_t sg_item_offset = sycl::exclusive_scan_over_group( + sgroup, contrib, sycl::plus{}); + + new_offset_id |= + (wi_bit_set ? (offset_prefix + sg_item_offset) : OffsetT(0)); + + // the last scanned value does not contain number of all copies, thus + // adding contribution + std::uint32_t sg_total_offset = sycl::group_broadcast( + sgroup, sg_item_offset + contrib, sg_size - 1); + + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ +private: + sycl::sub_group sgroup; + std::uint32_t lid; + sycl::ext::oneapi::sub_group_mask item_sg_mask; + + sycl::ext::oneapi::sub_group_mask mask_builder(std::uint32_t mask, + std::uint32_t sg_size) + { + return sycl::detail::Builder::createSubGroupMask< + sycl::ext::oneapi::sub_group_mask>(mask, sg_size); + } + +public: + using TempStorageT = empty_storage; + + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()), + item_sg_mask( + mask_builder(n_ls_bits_set(lid), sgroup.get_local_linear_range())) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + // set local id's bit to 1 if the bucket value matches the radix state + auto peer_mask = sycl::ext::oneapi::group_ballot(sgroup, wi_bit_set); + std::uint32_t peer_mask_bits{}; + + peer_mask.extract_bits(peer_mask_bits); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask &= item_sg_mask; + peer_mask.extract_bits(peer_mask_bits); + + new_offset_id |= wi_bit_set + ? 
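+        // NB: extract_bits() materializes the ballot into a single
+        // std::uint32_t, which is why the dispatch in
+        // parallel_radix_sort_iteration_step only selects this helper for
+        // sub-group sizes of 8, 16, or 32.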
(offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT(0); + + return sg_total_offset; + } +}; + +template +void copy_func_for_radix_sort(const std::size_t n_segments, + const std::size_t elems_per_segment, + const std::size_t sg_size, + const std::uint32_t lid, + const std::size_t wgr_id, + const InputT *input_ptr, + const std::size_t n_values, + OutputT *output_ptr) +{ + // item info + const std::size_t seg_start = elems_per_segment * wgr_id; + + std::size_t seg_end = sycl::min(seg_start + elems_per_segment, n_values); + + // ensure that each work item in a subgroup does the same number of loop + // iterations + const std::uint16_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + // find offsets for the same values within a segment and fill the resulting + // buffer + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + output_ptr[val_id] = std::move(input_ptr[val_id]); + } + + if (tail_size > 0 && lid < tail_size) { + const std::size_t val_id = seg_end + lid; + output_ptr[val_id] = std::move(input_ptr[val_id]); + } +} + +//----------------------------------------------------------------------- +// radix sort: reorder kernel (per iteration) +//----------------------------------------------------------------------- +template +sycl::event + radix_sort_reorder_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_offset, + std::size_t n_values, + const InputT *input_ptr, + OutputT *output_ptr, + std::size_t n_offsets, + OffsetT *offset_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector dependency_events) +{ + using ValueT = InputT; + using PeerHelper = peer_prefix_helper; + + static constexpr std::uint32_t radix_states = std::uint32_t{1} + << radix_bits; + static constexpr std::uint32_t radix_mask = radix_states - 1; + const std::size_t elems_per_segment = + (n_values + n_segments - 1) / n_segments; + + const std::size_t no_op_flag_id = n_offsets - 1; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + sycl::event reorder_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + cgh.use_kernel_bundle(kb); + + using StorageT = typename PeerHelper::TempStorageT; + + StorageT peer_temp(1, cgh); + + sycl::range<1> lRange{sg_size}; + sycl::range<1> gRange{n_iters * n_segments * sg_size}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + + auto b_offset_ptr = offset_ptr + iter_id * n_offsets; + auto b_input_ptr = input_ptr + iter_id * n_values; + auto b_output_ptr = output_ptr + iter_id * n_values; + + const std::uint32_t lid = ndit.get_local_id(0); + + auto &no_op_flag = b_offset_ptr[no_op_flag_id]; + if (no_op_flag) { + // no reordering necessary, simply copy + copy_func_for_radix_sort( + n_segments, elems_per_segment, sg_size, lid, segment_id, + b_input_ptr, n_values, b_output_ptr); + return; + } + + // create a private array for storing offset values + // and add total offset and offset for compute unit + 
// for a certain radix state + std::array offset_arr{}; + const std::size_t scan_size = n_segments + 1; + + OffsetT scanned_bin = 0; + + /* find cumulative offset */ + static constexpr std::uint32_t zero_radix_state_id = 0; + offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id]; + + for (std::uint32_t radix_state_id = 1; + radix_state_id < radix_states; ++radix_state_id) { + const std::uint32_t local_offset_id = + segment_id + scan_size * radix_state_id; + + // scan bins serially + const std::size_t last_segment_bucket_id = + radix_state_id * scan_size - 1; + scanned_bin += b_offset_ptr[last_segment_bucket_id]; + + offset_arr[radix_state_id] = + scanned_bin + b_offset_ptr[local_offset_id]; + } + + const std::size_t seg_start = elems_per_segment * segment_id; + std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n_values); + // ensure that each work item in a subgroup does the same number of + // loop iterations + const std::uint32_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + const PeerHelper peer_prefix_hlp(ndit, peer_temp); + + // find offsets for the same values within a segment and fill the + // resulting buffer + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + if (tail_size > 0) { + ValueT in_val; + + // default: is greater than any actual radix state + std::uint32_t bucket_id = radix_states; + if (lid < tail_size) { + in_val = std::move(b_input_ptr[seg_end + lid]); + + const auto proj_val = proj_op(in_val); + const auto mapped_val = + (is_ascending) + ? 
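+                        // NB: work-items past the tail keep the default
+                        // bucket_id == radix_states, which matches no real
+                        // radix state below, so they contribute nothing to
+                        // any peer mask while still executing the uniform,
+                        // sub-group-collective peer_contribution calls.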
order_preserving_cast( + proj_val) + : order_preserving_cast( + proj_val); + bucket_id = + get_bucket_id(mapped_val, radix_offset); + } + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + new_offset_id, offset_arr[radix_state_id], + is_current_bucket); + + offset_arr[radix_state_id] += sg_total_offset; + } + + if (lid < tail_size) { + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + }); + }); + + return reorder_ev; +} + +template +sizeT _slm_adjusted_work_group_size(sycl::queue &exec_q, + sizeT required_slm_bytes_per_wg, + sizeT wg_size) +{ + const auto &dev = exec_q.get_device(); + + if (wg_size == 0) + wg_size = + dev.template get_info(); + + const auto local_mem_sz = + dev.template get_info(); + + return sycl::min(local_mem_sz / required_slm_bytes_per_wg, wg_size); +} + +//----------------------------------------------------------------------- +// radix sort: one iteration +//----------------------------------------------------------------------- + +template +struct parallel_radix_sort_iteration_step +{ + template + using count_phase = radix_sort_count_kernel; + template + using local_scan_phase = radix_sort_scan_kernel; + template + using reorder_peer_phase = + radix_sort_reorder_peer_kernel; + template + using reorder_phase = radix_sort_reorder_kernel; + + template + static sycl::event submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_iter, + std::size_t n_values, + const InputT *in_ptr, + OutputT *out_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &dependency_events) + { + using _RadixCountKernel = count_phase; + using _RadixLocalScanKernel = + local_scan_phase; + using _RadixReorderPeerKernel = + reorder_peer_phase; + using _RadixReorderKernel = + reorder_phase; + + const auto &supported_sub_group_sizes = + exec_q.get_device() + .template get_info(); + const std::size_t max_sg_size = + (supported_sub_group_sizes.empty() + ? 0 + : supported_sub_group_sizes.back()); + const std::size_t reorder_sg_size = max_sg_size; + const std::size_t scan_wg_size = + exec_q.get_device() + .template get_info(); + + static constexpr std::size_t two_mils = (std::size_t(1) << 21); + std::size_t count_wg_size = + ((max_sg_size > 0) && (n_values > two_mils) ? 128 : max_sg_size); + + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + // correct count_wg_size according to local memory limit in count phase + const auto max_count_wg_size = _slm_adjusted_work_group_size( + exec_q, sizeof(CountT) * radix_states, count_wg_size); + count_wg_size = + static_cast<::std::size_t>((max_count_wg_size / radix_states)) * + radix_states; + + // work-group size must be a power of 2 and not less than the number of + // states, for scanning to work correctly + + const std::size_t rounded_down_count_wg_size = + std::size_t{1} << (number_of_bits_in_type() - + sycl::clz(count_wg_size) - 1); + count_wg_size = + sycl::max(rounded_down_count_wg_size, std::size_t(radix_states)); + + // Compute the radix position for the given iteration + std::uint32_t radix_offset = radix_iter * radix_bits; + + // 1. 
Count Phase + sycl::event count_ev = + radix_sort_count_submit<_RadixCountKernel, radix_bits>( + exec_q, n_iters, n_segments, count_wg_size, radix_offset, + n_values, in_ptr, n_counts, counts_ptr, proj_op, is_ascending, + dependency_events); + + // 2. Scan Phase + sycl::event scan_ev = + radix_sort_scan_submit<_RadixLocalScanKernel, radix_bits>( + exec_q, n_iters, n_segments, scan_wg_size, n_values, n_counts, + counts_ptr, {count_ev}); + + // 3. Reorder Phase + sycl::event reorder_ev{}; + // subgroup_ballot-based peer algo uses extract_bits to populate + // uint32_t mask and hence relies on sub-group to be 32 or narrower + static constexpr std::size_t sg32_v = 32u; + static constexpr std::size_t sg16_v = 16u; + static constexpr std::size_t sg08_v = 8u; + if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size || + sg08_v == reorder_sg_size) { + static constexpr auto peer_algorithm = + peer_prefix_algo::subgroup_ballot; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + else { + static constexpr auto peer_algorithm = + peer_prefix_algo::scan_then_broadcast; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + + return reorder_ev; + } +}; // struct parallel_radix_sort_iteration + +template +class radix_sort_one_wg_krn; + +template +struct subgroup_radix_sort +{ +private: + class use_slm_tag + { + }; + class use_global_mem_tag + { + }; + +public: + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + ValueT *input_ptr, + OutputT *output_ptr, + ProjT proj_op, + const bool is_ascending, + const std::vector &depends) + { + static_assert(std::is_same_v, OutputT>); + + using _SortKernelLoc = + radix_sort_one_wg_krn; + using _SortKernelPartGlob = + radix_sort_one_wg_krn; + using _SortKernelGlob = + radix_sort_one_wg_krn; + + static constexpr std::size_t max_concurrent_work_groups = 128U; + + // Choose this to occupy the entire accelerator + const std::size_t n_work_groups = + std::min(n_iters, max_concurrent_work_groups); + + // determine which temporary allocation can be accommodated in SLM + const auto &SLM_availability = + check_slm_size(exec_q, n_to_sort); + + const std::size_t n_batch_size = n_work_groups; + + switch (SLM_availability) { + case temp_allocations::both_in_slm: + { + static constexpr auto storage_for_values = use_slm_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelLoc>()( + exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + case temp_allocations::counters_in_slm: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelPartGlob>()( + exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + default: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_global_mem_tag{}; + + return one_group_submitter<_SortKernelGlob>()( + exec_q, n_iters, 
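+            // n_batch_size caps how many of the n_iters independent rows
+            // are resident at once, bounding the global-memory temporaries
+            // when the values (and/or the counters) do not fit in SLM.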
n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + } + } + +private: + template + class TempBuf; + + template + class TempBuf + { + std::size_t buf_size; + + public: + TempBuf(std::size_t, std::size_t n) : buf_size(n) {} + auto get_acc(sycl::handler &cgh) + { + return sycl::local_accessor(buf_size, cgh); + } + + std::size_t get_iter_stride() const { return std::size_t{0}; } + }; + + template + class TempBuf + { + sycl::buffer buf; + std::size_t iter_stride; + + public: + TempBuf(std::size_t n_iters, std::size_t n) + : buf(n_iters * n), iter_stride(n) + { + } + auto get_acc(sycl::handler &cgh) + { + return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init); + } + std::size_t get_iter_stride() const { return iter_stride; } + }; + + static_assert(wg_size <= 1024); + static constexpr std::uint16_t bin_count = (1 << radix); + static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1; + + enum class temp_allocations + { + both_in_slm, + counters_in_slm, + both_in_global_mem + }; + + template + temp_allocations check_slm_size(const sycl::queue &exec_q, SizeT n) + { + // the kernel is designed for data size <= 64K + assert(n <= (SizeT(1) << 16)); + + static constexpr auto req_slm_size_counters = + counter_buf_sz * sizeof(std::uint16_t); + + const auto &dev = exec_q.get_device(); + + // Pessimistically only use half of the memory to take into account + // a SYCL group algorithm might use a portion of SLM + const std::size_t max_slm_size = + dev.template get_info() / 2; + + const auto n_uniform = 1 << ceil_log2(n); + const auto req_slm_size_val = sizeof(T) * n_uniform; + + return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size) + ? + // the values and the counters are placed in SLM + temp_allocations::both_in_slm + : (req_slm_size_counters <= max_slm_size) + ? 
+ // the counters are placed in SLM, the values - in the + // global memory + temp_allocations::counters_in_slm + : + // the values and the counters are placed in the global + // memory + temp_allocations::both_in_global_mem; + } + + template + struct one_group_submitter + { + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_batch_size, + std::size_t n_values, + InputT *input_arr, + OutputT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + SLM_value_tag, + SLM_counter_tag, + const std::vector &depends) + { + assert(!(n_values >> 16)); + + assert(n_values <= static_cast(block_size) * + static_cast(wg_size)); + + const std::uint16_t n = static_cast(n_values); + static_assert(std::is_same_v, OutputT>); + + using ValueT = OutputT; + + using KeyT = std::invoke_result_t; + + TempBuf buf_val( + n_batch_size, static_cast(block_size * wg_size)); + TempBuf buf_count( + n_batch_size, static_cast(counter_buf_sz)); + + sycl::range<1> lRange{wg_size}; + + sycl::event sort_ev; + std::vector deps{depends}; + + const std::size_t n_batches = + (n_iters + n_batch_size - 1) / n_batch_size; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + const auto &krn = kb.get_kernel(kernel_id); + + const std::uint32_t krn_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // due to a bug in CPU device implementation, an additional + // synchronization is necessary for short sub-group sizes + const bool work_around_needed = + exec_q.get_device().has(sycl::aspect::cpu) && + (krn_sg_size < 16); + + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { + + const std::size_t block_start = batch_id * n_batch_size; + + // input_arr/output_arr each has shape (n_iters, n) + InputT *this_input_arr = input_arr + block_start * n_values; + OutputT *this_output_arr = output_arr + block_start * n_values; + + const std::size_t block_end = + std::min(block_start + n_batch_size, n_iters); + + sycl::range<1> gRange{(block_end - block_start) * wg_size}; + sycl::nd_range ndRange{gRange, lRange}; + + sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + cgh.use_kernel_bundle(kb); + + // allocation to use for value exchanges + auto exchange_acc = buf_val.get_acc(cgh); + const std::size_t exchange_acc_iter_stride = + buf_val.get_iter_stride(); + + // allocation for counters + auto counter_acc = buf_count.get_acc(cgh); + const std::size_t counter_acc_iter_stride = + buf_count.get_iter_stride(); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> + ndit) { + ValueT values[block_size]; + + const std::size_t iter_id = ndit.get_group(0); + const std::size_t iter_val_offset = + iter_id * static_cast(n); + const std::size_t iter_counter_offset = + iter_id * counter_acc_iter_stride; + const std::size_t iter_exchange_offset = + iter_id * exchange_acc_iter_stride; + + std::uint16_t wi = ndit.get_local_linear_id(); + std::uint16_t begin_bit = 0; + + static constexpr std::uint16_t end_bit = + number_of_bits_in_type(); + + // copy from input array into values +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + values[i] = + (id < n) ? 
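+                        // Padding sketch: lanes whose global id falls past
+                        // n load a default-constructed ValueT; the counting
+                        // phase later maps such padded slots to the highest
+                        // bin, so they cluster past the live elements and
+                        // the final write-out (guarded by r < n) drops them.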
this_input_arr[iter_val_offset + id] + : ValueT{}; + } + + while (true) { + // indices for indirect access in the "re-order" + // phase + std::uint16_t indices[block_size]; + { + // pointers to bucket's counters + std::uint16_t *counters[block_size]; + + // counting phase + auto pcounter = + get_accessor_pointer(counter_acc) + + (wi + iter_counter_offset); + + // initialize counters +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; ++i) + pcounter[i * wg_size] = std::uint16_t{0}; + + sycl::group_barrier(ndit.get_group()); + + if (is_ascending) { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? get_bucket_id( + order_preserving_cast< + /* is_ascending */ + true>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + else { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? get_bucket_id( + order_preserving_cast< + /* is_ascending */ + false>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan phase + { + + // scan contiguous numbers + std::uint16_t bin_sum[bin_count]; + const std::size_t counter_offset0 = + iter_counter_offset + wi * bin_count; + bin_sum[0] = counter_acc[counter_offset0]; + +#pragma unroll + for (std::uint16_t i = 1; i < bin_count; + ++i) + bin_sum[i] = + bin_sum[i - 1] + + counter_acc[counter_offset0 + i]; + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan local sum + std::uint16_t sum_scan = + sycl::exclusive_scan_over_group( + ndit.get_group(), + bin_sum[bin_count - 1], + sycl::plus()); + +// add to local sum, generate exclusive scan result +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; + ++i) + counter_acc[counter_offset0 + i + 1] = + sum_scan + bin_sum[i]; + + if (wi == 0) + counter_acc[iter_counter_offset + 0] = + std::uint32_t{0}; + + sycl::group_barrier(ndit.get_group()); + } + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + // a global index is a local offset plus a + // global base index + indices[i] += *counters[i]; + } + + sycl::group_barrier(ndit.get_group()); + } + + begin_bit += radix; + + // "re-order" phase + sycl::group_barrier(ndit.get_group()); + if (begin_bit >= end_bit) { + // the last iteration - writing out the result +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; + if (r < n) { + this_output_arr[iter_val_offset + r] = + values[i]; + } + } + + return; + } + + // 
data exchange +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; + if (r < n) + exchange_acc[iter_exchange_offset + r] = + values[i]; + } + + sycl::group_barrier(ndit.get_group()); + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + if (id < n) + values[i] = + exchange_acc[iter_exchange_offset + id]; + } + + sycl::group_barrier(ndit.get_group()); + } + }); + }); + + deps = {sort_ev}; + } + + return sort_ev; + } + }; +}; + +template +struct OneWorkGroupRadixSortKernel; + +//----------------------------------------------------------------------- +// radix sort: main function +//----------------------------------------------------------------------- +template +sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + const ValueT *input_arr, + ValueT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &depends) +{ + assert(n_to_sort > 1); + + using KeyT = std::remove_cv_t< + std::remove_reference_t>>; + + // radix bits represent number of processed bits in each value during one + // iteration + static constexpr std::uint32_t radix_bits = 4; + + sycl::event sort_ev{}; + + const auto &dev = exec_q.get_device(); + const auto max_wg_size = + dev.template get_info(); + + static constexpr std::uint16_t ref_wg_size = 64; + if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) { + using _RadixSortKernel = OneWorkGroupRadixSortKernel; + + if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) { + // wg_size * block_size == 64 * 1 * 1 == 64 + static constexpr std::uint16_t wg_size = ref_wg_size; + static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 1 == 128 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 2 == 256 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 2; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 4 == 512 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 4; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 8 == 1024 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + 
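+        // NB: each branch of this ladder fixes wg_size * block_size at
+        // compile time to the next power-of-two capacity (64 * 2 * 8 == 1024
+        // here), so the per-work-item `values[block_size]` array in the
+        // one-work-group kernel is statically sized and a single work-group
+        // owns the whole row.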
} + else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 8 == 2048 + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 16 == 4096 + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) { + // wg_size * block_size == 64 * 8 * 16 == 8192 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else { + // wg_size * block_size == 64 * 8 * 32 == 16384 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 32; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + } + else { + static constexpr std::uint32_t radix_iters = + number_of_buckets_in_type(radix_bits); + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + static constexpr std::size_t bound_512k = (std::size_t(1) << 19); + static constexpr std::size_t bound_2m = (std::size_t(1) << 21); + + const auto wg_sz_k = (n_to_sort < bound_512k) ? 8 + : (n_to_sort <= bound_2m) ? 
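+        // Sizing sketch (assuming radix_bits == 4): a 64-bit key takes
+        // radix_iters == 16 count/scan/reorder sweeps, ping-ponging between
+        // output_arr and tmp_arr; the even number of sweeps leaves the
+        // final result in output_arr. Smaller inputs use smaller
+        // work-groups (max_wg_size / 8 below 512K elements, / 4 up to 2M,
+        // full size beyond) to expose more segments.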
4 + : 1; + const std::size_t wg_size = max_wg_size / wg_sz_k; + + const std::size_t n_segments = (n_to_sort + wg_size - 1) / wg_size; + + // Additional radix_states elements are used for getting local offsets + // from count values + no_op flag; 'No operation' flag specifies whether + // to skip re-order phase if the all keys are the same (lie in one bin) + const std::size_t n_counts = + (n_segments + 1) * radix_states + 1 /*no_op flag*/; + + using CountT = std::uint32_t; + + // memory for storing count and offset values + auto count_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_counts, exec_q); + + CountT *count_ptr = count_owner.get(); + + static constexpr std::uint32_t zero_radix_iter{0}; + + if constexpr (std::is_same_v) { + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, output_arr, + n_counts, count_ptr, proj_op, + is_ascending, depends); + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, count_owner); + + return sort_ev; + } + + auto tmp_arr_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_to_sort, exec_q); + + ValueT *tmp_arr = tmp_arr_owner.get(); + + // iterations per each bucket + assert("Number of iterations must be even" && radix_iters % 2 == 0); + assert(radix_iters > 0); + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, tmp_arr, n_counts, + count_ptr, proj_op, is_ascending, + depends); + + for (std::uint32_t radix_iter = 1; radix_iter < radix_iters; + ++radix_iter) { + if (radix_iter % 2 == 0) { + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/true>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, output_arr, + tmp_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + else { + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/false>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, tmp_arr, + output_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + } + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, tmp_arr_owner, count_owner); + } + + return sort_ev; +} + +struct IdentityProj +{ + constexpr IdentityProj() {} + + template + constexpr T operator()(T val) const + { + return val; + } +}; + +template +struct ValueProj +{ + constexpr ValueProj() {} + + constexpr ValueT operator()(const std::pair &pair) const + { + return pair.first; + } +}; + +template +struct IndexedProj +{ + IndexedProj(const ValueT *arg_ptr) : ptr(arg_ptr), value_projector{} {} + + IndexedProj(const ValueT *arg_ptr, const ProjT &proj_op) + : ptr(arg_ptr), value_projector(proj_op) + { + } + + auto operator()(IndexT i) const { return value_projector(ptr[i]); } + +private: + const ValueT *ptr; + ProjT value_projector; +}; + +} // namespace radix_sort_details + +using dpctl::tensor::ssize_t; + +template +sycl::event + radix_sort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of rows, + // i.e. 
number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using Proj = radix_sort_details::IdentityProj; + static constexpr Proj proj_op{}; + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, proj_op, + sort_ascending, depends); + + return radix_sort_ev; +} + +template +class radix_argsort_index_write_out_krn; + +template +class radix_argsort_iota_krn; + +template +sycl::event + radix_argsort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of + // rows in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of + // rows, i.e. number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(total_nelems, + exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = radix_argsort_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op, + sort_ascending, {iota_ev}); + + using MapBackKernelName = radix_argsort_index_write_out_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event dep = radix_sort_ev; + + // no need to perform map_back ( id % sort_nelems) + // if total_nelems == sort_nelems + if (iter_nelems > 1u) { + dep = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {dep}); + } + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dep}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp new file mode 100644 index 000000000000..1f3576402511 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -0,0 +1,119 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines binary-search utilities used by the tensor searchsorted
+/// kernels.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+
+namespace dpctl::tensor::kernels::search_sorted_detail
+{
+
+template <typename T>
+T quotient_ceil(T n, T m)
+{
+    return (n + m - 1) / m;
+}
+
+template <typename Acc, typename Value, typename Compare>
+std::size_t lower_bound_impl(const Acc acc,
+                             const std::size_t first,
+                             const std::size_t last,
+                             const Value &value,
+                             const Compare &comp)
+{
+    std::size_t n = last - first;
+    std::size_t cur = n, start = first;
+    std::size_t it;
+    while (n > 0) {
+        it = start;
+        cur = n / 2;
+        it += cur;
+        if (comp(acc[it], value)) {
+            n -= cur + 1, start = ++it;
+        }
+        else
+            n = cur;
+    }
+    return start;
+}
+
+template <typename Acc, typename Value, typename Compare>
+std::size_t upper_bound_impl(const Acc acc,
+                             const std::size_t first,
+                             const std::size_t last,
+                             const Value &value,
+                             const Compare &comp)
+{
+    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
+    return lower_bound_impl(acc, first, last, value, op_comp);
+}
+
+template <typename Acc, typename Value, typename Compare, typename IndexerT>
+std::size_t lower_bound_indexed_impl(const Acc acc,
+                                     std::size_t first,
+                                     std::size_t last,
+                                     const Value &value,
+                                     const Compare &comp,
+                                     const IndexerT &acc_indexer)
+{
+    std::size_t n = last - first;
+    std::size_t cur = n, start = first;
+    std::size_t it;
+    while (n > 0) {
+        it = start;
+        cur = n / 2;
+        it += cur;
+        if (comp(acc[acc_indexer(it)], value)) {
+            n -= cur + 1, start = ++it;
+        }
+        else
+            n = cur;
+    }
+    return start;
+}
+
+template <typename Acc, typename Value, typename Compare, typename IndexerT>
+std::size_t upper_bound_indexed_impl(const Acc acc,
+                                     const std::size_t first,
+                                     const std::size_t last,
+                                     const Value &value,
+                                     const Compare &comp,
+                                     const IndexerT &acc_indexer)
+{
+    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
+    return lower_bound_indexed_impl(acc, first, last, value, op_comp,
+                                    acc_indexer);
+}
+
+} // namespace dpctl::tensor::kernels::search_sorted_detail
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
new file mode 100644
index 000000000000..bc400c9e569a
--- /dev/null
+++
b/dpnp/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -0,0 +1,258 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct SearchSortedFunctor +{ +private: + const argTy *hay_tp; + const argTy *needles_tp; + indTy *positions_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + PositionsIndexerT positions_indexer; + +public: + SearchSortedFunctor(const argTy *hay_, + const argTy *needles_, + indTy *positions_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const PositionsIndexerT &positions_indexer_) + : hay_tp(hay_), needles_tp(needles_), positions_tp(positions_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), + positions_indexer(positions_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + const Compare comp{}; + + const std::size_t i = id[0]; + const argTy needle_v = needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + indTy pos{}; + + static constexpr std::size_t zero(0); + if constexpr (left_side) { + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. 
needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + else { + // search in hay in right-closed interval: hay[pos - 1] <= needle_v + // < hay[pos] + + // upper_bound returns the first pos such that bool(needle_v < + // hay[pos]) is true, i.e. needle_v < hay[pos] + pos = search_sorted_detail::upper_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + + positions_tp[positions_indexer(i)] = pos; + } +}; + +typedef sycl::event (*searchsorted_contig_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class searchsorted_contig_impl_krn; + +template +sycl::event searchsorted_contig_impl(sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const argTy *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + indTy *positions_tp = + reinterpret_cast(positions_cp) + positions_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class searchsorted_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT positions_indexer{}; + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*searchsorted_strided_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class searchsorted_strided_impl_krn; + +template +sycl::event searchsorted_strided_impl( + sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // positions_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp); + const argTy *needles_tp = reinterpret_cast(needles_cp); + + indTy *positions_tp = reinterpret_cast(positions_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = 
packed_shape_strides; + const NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using PositionsIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *positions_shape = packed_shape_strides; + const ssize_t *positions_strides = + packed_shape_strides + 2 * needles_nd; + const PositionsIndexerT positions_indexer( + needles_nd, positions_offset, positions_shape, positions_strides); + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + using KernelName = + class searchsorted_strided_impl_krn; + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp new file mode 100644 index 000000000000..7b48f310a445 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
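The `SearchSortedFunctor` above delegates the actual search to the `lower_bound_impl`/`upper_bound_impl` helpers from `search_sorted_detail.hpp`. A minimal host-side sketch of their semantics (the `main` harness and the sample data are illustrative and not part of the patch):

```cpp
#include <cassert>
#include <functional>
#include <vector>

#include "kernels/sorting/search_sorted_detail.hpp"

int main()
{
    namespace ssd = dpctl::tensor::kernels::search_sorted_detail;

    const std::vector<int> hay{1, 2, 2, 3};
    const std::less<int> comp{};

    // left side ("lower bound"): first pos with needle <= hay[pos]
    assert(ssd::lower_bound_impl(hay.data(), 0, hay.size(), 2, comp) == 1);
    // right side ("upper bound"): first pos with needle < hay[pos]
    assert(ssd::upper_bound_impl(hay.data(), 0, hay.size(), 2, comp) == 3);

    return 0;
}
```

The `_indexed_impl` variants used by the strided kernel behave identically, except each access goes through an indexer that translates a logical position into a memory offset.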
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+
+namespace dpctl::tensor::kernels
+{
+
+using dpctl::tensor::ssize_t;
+
+typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &,
+                                            std::size_t,
+                                            std::size_t,
+                                            const char *,
+                                            char *,
+                                            ssize_t,
+                                            ssize_t,
+                                            ssize_t,
+                                            ssize_t,
+                                            const std::vector<sycl::event> &);
+
+} // namespace dpctl::tensor::kernels
diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
new file mode 100644
index 000000000000..fd32905b808e
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
@@ -0,0 +1,144 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines utility kernels used by tensor sort/argsort operations.
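Note that `sort_contig_fn_ptr_t` carries no ascending/descending flag, so an implementation that takes one, such as `radix_sort_axis1_contig_impl` above, needs a thin adapter before it can populate a dispatch table of this type. A hypothetical sketch (the adapter name and the `float` instantiation are illustrative assumptions, not part of the patch):

```cpp
#include <cstddef>
#include <vector>

#include <sycl/sycl.hpp>

// Assumes the radix-sort header defining radix_sort_axis1_contig_impl
// (shown earlier in this patch) is on the include path.
using dpctl::tensor::ssize_t;

template <typename argTy>
sycl::event ascending_radix_sort_contig(sycl::queue &exec_q,
                                        std::size_t iter_nelems,
                                        std::size_t sort_nelems,
                                        const char *arg_cp, char *res_cp,
                                        ssize_t iter_arg_offset,
                                        ssize_t iter_res_offset,
                                        ssize_t sort_arg_offset,
                                        ssize_t sort_res_offset,
                                        const std::vector<sycl::event> &depends)
{
    // bind sort_ascending so the signature matches sort_contig_fn_ptr_t
    return dpctl::tensor::kernels::radix_sort_axis1_contig_impl<argTy>(
        exec_q, /*sort_ascending=*/true, iter_nelems, sort_nelems, arg_cp,
        res_cp, iter_arg_offset, iter_res_offset, sort_arg_offset,
        sort_res_offset, depends);
}

// usable as a table entry:
// dpctl::tensor::kernels::sort_contig_fn_ptr_t fn =
//     ascending_radix_sort_contig<float>;
```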
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::kernels::sort_utils_detail +{ + +namespace syclexp = sycl::ext::oneapi::experimental; + +template +sycl::event iota_impl(sycl::queue &exec_q, + T *data, + std::size_t nelems, + const std::vector &dependent_events) +{ + static constexpr std::uint32_t lws = 256; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws); + + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t offset = (gid - lane_id) * n_wi; + const std::uint32_t max_sgSize = sg.get_max_local_range()[0]; + + std::array stripe{}; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + stripe[i] = T(offset + lane_id + i * max_sgSize); + } + + if (offset + n_wi * max_sgSize < nelems) { + static constexpr auto group_ls_props = + syclexp::properties{syclexp::data_placement_striped}; + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&data[offset]); + + syclexp::group_store(sg, sycl::span{&stripe[0], n_wi}, + out_multi_ptr, group_ls_props); + } + else { + for (std::size_t idx = offset + lane_id; idx < nelems; + idx += max_sgSize) { + data[idx] = T(idx); + } + } + }); + }); + + return e; +} + +template +sycl::event map_back_impl(sycl::queue &exec_q, + std::size_t nelems, + const IndexTy *flat_index_data, + IndexTy *reduced_index_data, + std::size_t row_size, + const std::vector &dependent_events) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const IndexTy linear_index = flat_index_data[data_id]; + reduced_index_data[data_id] = (linear_index % row_size); + } + } + }); + }); + + return map_back_ev; +} + +} // namespace dpctl::tensor::kernels::sort_utils_detail diff --git a/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp new file mode 100644 index 000000000000..1bbaa9e8345a --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -0,0 +1,508 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor topk operation. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace topk_detail +{ + +void scale_topk_params(const std::uint64_t nelems_per_slm, + const std::size_t sub_groups_per_work_group, + const std::uint32_t elems_per_wi, + const std::vector &sg_sizes, + std::size_t &lws, + std::size_t &nelems_wg_sorts) +{ + for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) { + auto sg_size = *it; + lws = sub_groups_per_work_group * sg_size; + nelems_wg_sorts = elems_per_wi * lws; + if (nelems_wg_sorts < nelems_per_slm) { + return; + } + } + // should never reach + throw std::runtime_error("Could not construct top k kernel parameters"); +} + +template +sycl::event write_out_impl(sycl::queue &exec_q, + std::size_t iter_nelems, + std::size_t k, + const argTy *arg_tp, + const IndexTy *index_data, + std::size_t iter_index_stride, + std::size_t axis_nelems, + argTy *vals_tp, + IndexTy *inds_tp, + const std::vector &depends) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t nelems = iter_nelems * k; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> 
it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const std::size_t iter_id = data_id / k; + + /* + const std::size_t axis_gid = data_id - (iter_gid * k); + const std::size_t src_idx = iter_gid * iter_index_stride + + axis_gid; + */ + const std::size_t src_idx = + data_id + iter_id * (iter_index_stride - k); + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + const std::size_t dst_idx = data_id; + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + } + } + }); + }); + + return write_out_ev; +} + +} // namespace topk_detail + +template +class topk_populate_index_data_krn; + +template +class topk_full_merge_map_back_krn; + +template +sycl::event + topk_full_merge_sort_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) +{ + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * axis_nelems, exec_q); + // extract USM pointer + IndexTy *index_data = index_data_owner.get(); + + using IotaKernelName = topk_populate_index_data_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, index_data, iter_nelems * axis_nelems, depends); + + std::size_t sorted_block_size; + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, index_data, comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, + {base_sort_ev}); + + using WriteOutKernelName = topk_full_merge_map_back_krn; + + sycl::event write_out_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, + index_data_owner); + + return cleanup_host_task_event; +}; + +template +class topk_partial_merge_map_back_krn; + +template +class topk_over_work_group_krn; + +template > +sycl::event topk_merge_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. 
number of columns) + std::size_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + using dpctl::tensor::kernels::IndexComp; + const IndexComp index_comp{arg_tp, ValueComp{}}; + + if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); + } + else { + using PartialKernelName = + topk_over_work_group_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + std::size_t lws = sub_groups_per_work_group * max_sg_size; + + std::size_t sorted_block_size = elems_per_wi * lws; + if (sorted_block_size > nelems_per_slm) { + const std::vector sg_sizes = + dev.get_info(); + topk_detail::scale_topk_params( + nelems_per_slm, sub_groups_per_work_group, elems_per_wi, + sg_sizes, + lws, // modified by reference + sorted_block_size // modified by reference + ); + } + + // This assumption permits doing away with using a loop + assert(sorted_block_size % lws == 0); + + using search_sorted_detail::quotient_ceil; + const std::size_t n_segments = + quotient_ceil(axis_nelems, sorted_block_size); + + // round k up for the later merge kernel if necessary + const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4; + std::size_t k_rounded = + (k < round_k_to) + ? k + : quotient_ceil(k, round_k_to) * round_k_to; + + // get length of tail for alloc size + auto rem = axis_nelems % sorted_block_size; + auto alloc_len = (rem && rem < k_rounded) + ? 
rem + k_rounded * (n_segments - 1) + : k_rounded * n_segments; + + // if allocation would be sufficiently large or k is larger than + // elements processed, use full sort + if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || + alloc_len >= axis_nelems / 2) { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, + k, arg_tp, vals_tp, inds_tp, + index_comp, depends); + } + + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * alloc_len, exec_q); + // get raw USM pointer + IndexTy *index_data = index_data_owner.get(); + + // no need to populate index data: SLM will be populated with default + // values + + sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{sorted_block_size}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = + group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = + segment_id * sorted_block_size; + const std::size_t segment_end_idx = std::min( + segment_start_idx + sorted_block_size, axis_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) { + IndexTy v = (array_id < axis_nelems) + ? 
iter_id * axis_nelems + array_id + : IndexTy{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = + quotient_ceil(sorted_block_size, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + merge_sort_detail::leaf_sort_impl( + work_space, chunk_start_idx, chunk_end_idx, index_comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = sycl::min( + 2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = sycl::min( + end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = + chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_sort_detail::merge_impl( + offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + else { + merge_sort_detail::merge_impl( + offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + // output assumed to be structured as (iter_nelems, + // alloc_len) + const std::size_t k_segment_start_idx = + segment_id * k_rounded; + const std::size_t k_segment_end_idx = std::min( + k_segment_start_idx + k_rounded, alloc_len); + const auto &out_src = + (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = k_segment_start_idx + lid; + array_id < k_segment_end_idx; array_id += lws) { + if (lid < k_rounded) { + index_data[iter_id * alloc_len + array_id] = + out_src[array_id - k_segment_start_idx]; + } + } + }); + }); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, alloc_len, index_data, index_comp, + k_rounded, {base_sort_ev}); + + // Write out top k of the merge-sorted memory + using WriteOutKernelName = + topk_partial_merge_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, alloc_len, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, index_data_owner); + + return cleanup_host_task_event; + } +} + +template +class topk_iota_krn; + +template +class topk_radix_map_back_krn; + +template +sycl::event topk_radix_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool ascending, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + const std::size_t total_nelems = iter_nelems * axis_nelems; + const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + padded_total_nelems + total_nelems, exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + IndexTy *tmp_tp = workspace + padded_total_nelems; + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = topk_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op, + ascending, {iota_ev}); + + // Write out top k of the temporary + using WriteOutKernelName = topk_radix_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems, + vals_tp, inds_tp, {radix_sort_ev}); + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpnp/tensor/libtensor/include/kernels/where.hpp b/dpnp/tensor/libtensor/include/kernels/where.hpp new file mode 100644 index 000000000000..5527cccec8d2 --- /dev/null +++ b/dpnp/tensor/libtensor/include/kernels/where.hpp @@ -0,0 +1,336 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.where. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::search +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class where_strided_kernel; +template +class where_contig_kernel; + +template +class WhereContigFunctor +{ +private: + std::size_t nelems = 0; + const condT *cond_p = nullptr; + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + +public: + WhereContigFunctor(std::size_t nelems_, + const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_) + : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value || + is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi; + const std::size_t start = + (gid / sgSize) * (nelems_per_sg - sgSize) + gid; + const std::size_t end = 
std::min(nelems, start + nelems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + using dpctl::tensor::type_utils::convert_impl; + const bool check = convert_impl(cond_p[offset]); + dst_p[offset] = check ? x1_p[offset] : x2_p[offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + nelems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t idx = base + it * sgSize; + auto x1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + const sycl::vec x1_vec = + sub_group_load(sg, x1_multi_ptr); + const sycl::vec x2_vec = + sub_group_load(sg, x2_multi_ptr); + const sycl::vec cond_vec = + sub_group_load(sg, cond_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { + dst_p[k] = cond_p[k] ? 
x1_p[k] : x2_p[k]; + } + } + } + } +}; + +typedef sycl::event (*where_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event where_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + std::size_t lws = 64; + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(cond_cp) && + is_aligned(x1_cp) && + is_aligned(x2_cp) && + is_aligned(dst_cp)) { + static constexpr bool enable_sg_loadstore = true; + using KernelName = where_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + where_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + }); + + return where_ev; +} + +template +class WhereStridedFunctor +{ +private: + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + const condT *cond_p = nullptr; + IndexerT indexer; + +public: + WhereStridedFunctor(const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_, + const IndexerT &indexer_) + : x1_p(x1_p_), x2_p(x2_p_), dst_p(dst_p_), cond_p(cond_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + + using dpctl::tensor::type_utils::convert_impl; + bool check = + convert_impl(cond_p[offsets.get_first_offset()]); + + dst_p[offsets.get_fourth_offset()] = + check ? 
x1_p[offsets.get_second_offset()] + : x2_p[offsets.get_third_offset()]; + } +}; + +typedef sycl::event (*where_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event where_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x1_offset, + ssize_t x2_offset, + ssize_t cond_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, cond_offset, x1_offset, x2_offset, dst_offset, shape_strides}; + + cgh.parallel_for< + where_strided_kernel>( + sycl::range<1>(nelems), + WhereStridedFunctor( + cond_tp, x1_tp, x2_tp, dst_tp, indexer)); + }); + + return where_ev; +} + +template +struct WhereStridedFactory +{ + fnT get() + { + fnT fn = where_strided_impl; + return fn; + } +}; + +template +struct WhereContigFactory +{ + fnT get() + { + fnT fn = where_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::search diff --git a/dpnp/tensor/libtensor/include/utils/indexing_utils.hpp b/dpnp/tensor/libtensor/include/utils/indexing_utils.hpp new file mode 100644 index 000000000000..d28c8174c39c --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/indexing_utils.hpp @@ -0,0 +1,153 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
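`WhereContigFactory` and `WhereStridedFactory` above follow the extension's dispatch pattern: a templated `get()` that returns a type-erased function pointer, one per (value, condition) dtype pairing. A hypothetical sketch of pulling one entry out of a factory (the `float`/`bool` pairing and the helper name are chosen purely for illustration):

```cpp
#include "kernels/where.hpp"

// Fetch the contiguous where() implementation for T=float, condT=bool.
inline dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t
select_where_contig_float_bool()
{
    using fnT = dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t;
    using dpctl::tensor::kernels::search::WhereContigFactory;

    return WhereContigFactory<fnT, float, bool>{}.get();
}
```

In the extension proper, a table builder would instantiate this over every supported dtype pair; the resulting pointer is later invoked with type-erased `char *` buffers and a `sycl::queue`.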
+//***************************************************************************** +/// +/// \file +/// This file defines utilities for handling out-of-bounds integer indices in +/// kernels that involve indexing operations, such as take, put, or advanced +/// tensor integer indexing. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::indexing_utils +{ +using dpctl::tensor::ssize_t; + +/* + * ssize_t for indices is a design choice, dpctl::tensor::usm_ndarray + * uses py::ssize_t for shapes and strides internally and Python uses + * py_ssize_t for sizes of e.g. lists. + */ + +template +struct WrapIndex +{ + static_assert(std::is_integral_v); + + ssize_t operator()(ssize_t max_item, IndT ind) const + { + ssize_t projected; + static constexpr ssize_t unit(1); + max_item = sycl::max(max_item, unit); + + static constexpr std::uintmax_t ind_max = + std::numeric_limits::max(); + static constexpr std::uintmax_t ssize_max = + std::numeric_limits::max(); + + if constexpr (std::is_signed_v) { + static constexpr std::intmax_t ind_min = + std::numeric_limits::min(); + static constexpr std::intmax_t ssize_min = + std::numeric_limits::min(); + + if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) { + const ssize_t ind_ = static_cast(ind); + const ssize_t lb = -max_item; + const ssize_t ub = max_item - 1; + projected = sycl::clamp(ind_, lb, ub); + } + else { + const IndT lb = static_cast(-max_item); + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::clamp(ind, lb, ub)); + } + return (projected < 0) ? projected + max_item : projected; + } + else { + if constexpr (ind_max <= ssize_max) { + const ssize_t ind_ = static_cast(ind); + const ssize_t ub = max_item - 1; + projected = sycl::min(ind_, ub); + } + else { + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::min(ind, ub)); + } + return projected; + } + } +}; + +template +struct ClipIndex +{ + static_assert(std::is_integral_v); + + ssize_t operator()(ssize_t max_item, IndT ind) const + { + ssize_t projected; + static constexpr ssize_t unit(1); + max_item = sycl::max(max_item, unit); + + static constexpr std::uintmax_t ind_max = + std::numeric_limits::max(); + static constexpr std::uintmax_t ssize_max = + std::numeric_limits::max(); + if constexpr (std::is_signed_v) { + static constexpr std::intmax_t ind_min = + std::numeric_limits::min(); + static constexpr std::intmax_t ssize_min = + std::numeric_limits::min(); + + if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) { + const ssize_t ind_ = static_cast(ind); + static constexpr ssize_t lb(0); + const ssize_t ub = max_item - 1; + projected = sycl::clamp(ind_, lb, ub); + } + else { + static constexpr IndT lb(0); + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::clamp(ind, lb, ub)); + } + } + else { + if constexpr (ind_max <= ssize_max) { + const ssize_t ind_ = static_cast(ind); + const ssize_t ub = max_item - 1; + projected = sycl::min(ind_, ub); + } + else { + const IndT ub = static_cast(max_item - 1); + projected = static_cast(sycl::min(ind, ub)); + } + } + return projected; + } +}; +} // namespace dpctl::tensor::indexing_utils diff --git a/dpnp/tensor/libtensor/include/utils/math_utils.hpp b/dpnp/tensor/libtensor/include/utils/math_utils.hpp new file mode 100644 index 000000000000..d35eff0074dc --- /dev/null +++ 
b/dpnp/tensor/libtensor/include/utils/math_utils.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines math utility functions. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace dpctl::tensor::math_utils +{ +template +bool less_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 < imag2) + : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool greater_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 > imag2) + : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool less_equal_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? (imag1 <= imag2) + : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +bool greater_equal_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + return (real1 == real2) + ? 
(imag1 >= imag2) + : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2)); +} + +template +T max_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + bool isnan_imag1 = std::isnan(imag1); + bool gt = (real1 == real2) + ? (imag1 > imag2) + : (real1 > real2 && !isnan_imag1 && !std::isnan(imag2)); + return (std::isnan(real1) || isnan_imag1 || gt) ? x1 : x2; +} + +template +T min_complex(const T &x1, const T &x2) +{ + using realT = typename T::value_type; + realT real1 = std::real(x1); + realT real2 = std::real(x2); + realT imag1 = std::imag(x1); + realT imag2 = std::imag(x2); + + bool isnan_imag1 = std::isnan(imag1); + bool lt = (real1 == real2) + ? (imag1 < imag2) + : (real1 < real2 && !isnan_imag1 && !std::isnan(imag2)); + return (std::isnan(real1) || isnan_imag1 || lt) ? x1 : x2; +} + +template +T logaddexp(T x, T y) +{ + if (x == y) { // handle signed infinities + const T log2 = sycl::log(T(2)); + return x + log2; + } + else { + const T tmp = x - y; + static constexpr T zero(0); + + return (tmp > zero) + ? (x + sycl::log1p(sycl::exp(-tmp))) + : ((tmp <= zero) ? y + sycl::log1p(sycl::exp(tmp)) + : std::numeric_limits::quiet_NaN()); + } +} +} // namespace dpctl::tensor::math_utils diff --git a/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp new file mode 100644 index 000000000000..b534e55b3192 --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp @@ -0,0 +1,157 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utility to determine whether two arrays have memory +/// overlap. 
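The `logaddexp` helper above computes `log(exp(x) + exp(y))` without overflow by only ever exponentiating a non-positive quantity, using the identity `log(e^x + e^y) = max(x, y) + log1p(e^{-|x - y|})`. A standalone restatement in `double` for a quick host-side sanity check (NaN propagation, which the SYCL helper handles in its final branch, is omitted here for brevity):

```cpp
#include <cassert>
#include <cmath>

// Reference restatement of the identity used by math_utils::logaddexp.
double logaddexp_ref(double x, double y)
{
    if (x == y) { // also covers x == y == +/-infinity
        return x + std::log(2.0);
    }
    const double tmp = x - y;
    // exp() only ever sees a non-positive argument, so no overflow
    return (tmp > 0.0) ? x + std::log1p(std::exp(-tmp))
                       : y + std::log1p(std::exp(tmp));
}

int main()
{
    // naive log(exp(1000) + exp(999)) would overflow to +inf
    assert(std::isfinite(logaddexp_ref(1000.0, 999.0)));
    assert(std::abs(logaddexp_ref(0.0, 0.0) - std::log(2.0)) < 1e-12);
    return 0;
}
```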
diff --git a/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp
new file mode 100644
index 000000000000..b534e55b3192
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/memory_overlap.hpp
@@ -0,0 +1,157 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines a utility to determine whether two arrays have memory
+/// overlap.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+
+#include <pybind11/pybind11.h>
+
+#include "dpnp4pybind11.hpp"
+
+/* @brief check for overlap of memory regions behind arrays.
+
+Presently assume that array occupies all bytes between smallest and largest
+displaced elements.
+
+TODO: Write proper Frobenius solver to account for holes, e.g.
+  overlap( x_contig[::2], x_contig[1::2]) should give False,
+  while this implementation gives True.
+*/
+namespace dpctl::tensor::overlap
+{
+namespace py = pybind11;
+
+struct MemoryOverlap
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        const char *ar1_data = ar1.get_data();
+
+        const auto &ar1_offsets = ar1.get_minmax_offsets();
+        py::ssize_t ar1_elem_size =
+            static_cast<py::ssize_t>(ar1.get_elemsize());
+
+        const char *ar2_data = ar2.get_data();
+        const auto &ar2_offsets = ar2.get_minmax_offsets();
+        py::ssize_t ar2_elem_size =
+            static_cast<py::ssize_t>(ar2.get_elemsize());
+
+        /* Memory of array1 extends from */
+        /* [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data +
+         *  ar1_offsets.second * ar1_elem_size + ar1_elem_size] */
+        /* Memory of array2 extends from */
+        /* [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data +
+         *  ar2_offsets.second * ar2_elem_size + ar2_elem_size] */
+
+        /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0
+         * <= y1)
+         * && (x1 <= y0 || y1 <= x0) */
+        /* Given that x0 <= x1 and y0 <= y1 are true by construction, the
+         * condition for overlap is (x1 > y0) && (y1 > x0) */
+
+        /* Applying:
+          (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size >
+          ar2_data +
+          ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second *
+          ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first *
+          ar1_elem_size)
+        */
+
+        auto byte_distance = static_cast<py::ssize_t>(ar2_data - ar1_data);
+
+        py::ssize_t x1_minus_y0 =
+            (-byte_distance +
+             (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) -
+              (ar2_offsets.first * ar2_elem_size)));
+
+        py::ssize_t y1_minus_x0 =
+            (byte_distance +
+             (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) -
+              (ar1_offsets.first * ar1_elem_size)));
+
+        bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0);
+
+        return memory_overlap;
+    }
+};
+
+struct SameLogicalTensors
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        // Same ndim
+        int nd1 = ar1.get_ndim();
+        if (nd1 != ar2.get_ndim())
+            return false;
+
+        // Same dtype
+        int tn1 = ar1.get_typenum();
+        if (tn1 != ar2.get_typenum())
+            return false;
+
+        // Same pointer
+        const char *ar1_data = ar1.get_data();
+        const char *ar2_data = ar2.get_data();
+
+        if (ar1_data != ar2_data)
+            return false;
+
+        // Same shape
+        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
+        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
+
+        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
+            return false;
+
+        // Same strides
+        auto const &ar1_strides = ar1.get_strides_vector();
+        auto const &ar2_strides = ar2.get_strides_vector();
+
+        auto ar1_beg_it = std::begin(ar1_strides);
+        auto ar1_end_it = std::end(ar1_strides);
+
+        auto ar2_beg_it = std::begin(ar2_strides);
+
+        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
+            return false;
+
+        // all checks passed: arrays are logical views
+        // into the same memory
+        return true;
+    }
+};
+} // namespace dpctl::tensor::overlap
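Reviewer note, not part of the diff: the overlap test above is the standard interval-intersection check. A worked example with made-up byte ranges:

```cpp
// Two byte ranges [x0, x1) and [y0, y1) overlap iff (x1 > y0) && (y1 > x0).
#include <cstdio>

int main()
{
    long x0 = 0, x1 = 40;  // e.g. 10 float32 elements at byte offset 0
    long y0 = 20, y1 = 40; // e.g. 5 float32 elements at byte offset 20
    std::printf("%d\n", (x1 > y0) && (y1 > x0)); // 1: the ranges intersect
    y0 = 40, y1 = 60;      // shift the second range past the first
    std::printf("%d\n", (x1 > y0) && (y1 > x0)); // 0: disjoint
}
```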
diff --git a/dpnp/tensor/libtensor/include/utils/offset_utils.hpp b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
new file mode 100644
index 000000000000..3a6ac75dfc3a
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/offset_utils.hpp
@@ -0,0 +1,788 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines Indexer callable operators to compute an element's
+/// offset in an array addressed by global_id.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "utils/strided_iters.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+
+namespace dpctl::tensor::offset_utils
+{
+namespace detail
+{
+struct sink_t
+{
+    sink_t() {};
+    template <class T>
+    sink_t(T &&) {};
+};
+
+template <class V>
+std::size_t __accumulate_size(std::size_t &s, V &&v)
+{
+    return s += v.size();
+}
+
+template <class V, class U>
+sink_t __appender(V &lhs, U &&rhs)
+{
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+    return {};
+}
+
+template <typename T, typename A, typename... Vs>
+std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
+{
+    std::size_t s = lhs.size();
+    {
+        // limited scope ensures array is freed
+        [[maybe_unused]] sink_t tmp[] = {__accumulate_size(s, vs)..., 0};
+    }
+    lhs.reserve(s);
+    {
+        // array of no-data objects ensures ordering of calls to the appender
+        [[maybe_unused]] sink_t tmp[] = {
+            __appender(lhs, std::forward<Vs>(vs))..., 0};
+    }
+
+    return std::move(lhs); // prevent return-value optimization
+}
+} // namespace detail
+
+template <typename indT, typename... Vs>
+std::tuple<std::unique_ptr<indT, dpctl::tensor::alloc_utils::USMDeleter>,
+           std::size_t,
+           sycl::event>
+    device_allocate_and_pack(sycl::queue &q,
+                             std::vector<sycl::event> &host_task_events,
+                             Vs &&...vs)
+{
+
+    using dpctl::tensor::alloc_utils::usm_host_allocator;
+
+    // memory transfer optimization, use USM-host for temporary speeds up
+    // transfer to device, especially on dGPUs
+    using usm_host_allocatorT = usm_host_allocator<indT>;
+    using shT = std::vector<indT, usm_host_allocatorT>;
+
+    usm_host_allocatorT usm_host_alloc(q);
+    shT empty{0, usm_host_alloc};
+    shT packed_shape_strides = detail::concat(std::move(empty), vs...);
+
+    auto packed_shape_strides_owner =
+        std::make_shared<shT>(std::move(packed_shape_strides));
+
+    auto sz = packed_shape_strides_owner->size();
+    auto shape_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<indT>(sz, q);
+    indT *shape_strides = shape_strides_owner.get();
+
+    sycl::event copy_ev =
+        q.copy<indT>(packed_shape_strides_owner->data(), shape_strides, sz);
+
+    sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(copy_ev);
+        cgh.host_task([packed_shape_strides_owner =
+                           std::move(packed_shape_strides_owner)] {
+            // increment shared pointer ref-count to keep it alive
+            // till copy operation completes;
+        });
+    });
+    host_task_events.push_back(cleanup_host_task_ev);
+
+    return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev);
+}
+
+struct NoOpIndexer
+{
+    constexpr NoOpIndexer() {}
+    constexpr std::size_t operator()(std::size_t gid) const { return gid; }
+};
+
+using dpctl::tensor::ssize_t;
+
+/* @brief Indexer whose equal-sized shape and strides arrays are packed into
+   a single array */
+struct StridedIndexer
+{
+    StridedIndexer(int _nd,
+                   ssize_t _offset,
+                   ssize_t const *_packed_shape_strides)
+        : nd(_nd), starting_offset(_offset),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return compute_offset(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_offset;
+    ssize_t const *shape_strides;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,      // shape ptr
+            shape_strides + nd, // strides ptr
+            relative_offset);
+        return starting_offset + relative_offset;
+    }
+};
+
+// ensure that indexer is device copyable
+static_assert(sycl::is_device_copyable_v<StridedIndexer>);
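Reviewer note, not part of the diff: a hedged sketch of the intended pattern, packing shape and strides with `device_allocate_and_pack` and addressing a 3x4 C-contiguous view with `StridedIndexer` inside a kernel. The function, queue handling, and include path are illustrative assumptions only.

```cpp
#include <vector>
#include <sycl/sycl.hpp>

#include "utils/offset_utils.hpp" // path assumed per this PR's layout

namespace ou = dpctl::tensor::offset_utils;
using ssizeT = dpctl::tensor::ssize_t;

void example(sycl::queue &q)
{
    const std::vector<ssizeT> shape{3, 4};
    const std::vector<ssizeT> strides{4, 1}; // element strides, C-contiguous

    std::vector<sycl::event> host_task_events;
    // one device allocation holding [shape..., strides...]
    auto [owner, sz, copy_ev] = ou::device_allocate_and_pack<ssizeT>(
        q, host_task_events, shape, strides);
    (void)sz; // sz == 4 here
    const ssizeT *packed = owner.get();

    q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(copy_ev);
         cgh.parallel_for(sycl::range<1>(12), [=](sycl::id<1> id) {
             const ou::StridedIndexer indexer{/*nd*/ 2, /*offset*/ 0, packed};
             [[maybe_unused]] ssizeT off = indexer(static_cast<ssizeT>(id[0]));
         });
     }).wait();
    sycl::event::wait(host_task_events); // keep `owner` alive until done
}
```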
+
+/* @brief Indexer with shape, strides provided separately */
+struct UnpackedStridedIndexer
+{
+    UnpackedStridedIndexer(int _nd,
+                           ssize_t _offset,
+                           ssize_t const *_shape,
+                           ssize_t const *_strides)
+        : nd(_nd), starting_offset(_offset), shape(_shape), strides(_strides)
+    {
+    }
+
+    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return compute_offset(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_offset;
+    ssize_t const *shape;
+    ssize_t const *strides;
+
+    ssize_t compute_offset(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape,   // shape ptr
+            strides, // strides ptr
+            relative_offset);
+        return starting_offset + relative_offset;
+    }
+};
+
+// ensure that indexer is device copyable
+static_assert(sycl::is_device_copyable_v<UnpackedStridedIndexer>);
+
+struct Strided1DIndexer
+{
+    Strided1DIndexer(std::size_t _size) : offset{}, size(_size), step(1) {}
+    Strided1DIndexer(ssize_t _size)
+        : offset{}, size(static_cast<std::size_t>(_size)), step(1)
+    {
+    }
+    Strided1DIndexer(std::size_t _size, ssize_t _step)
+        : offset{}, size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(std::size_t _size, std::size_t _step)
+        : offset{}, size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
+    Strided1DIndexer(ssize_t _size, ssize_t _step)
+        : offset{}, size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, std::size_t _size, ssize_t _step)
+        : offset(_offset), size(_size), step(_step)
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, std::size_t _size, std::size_t _step)
+        : offset(_offset), size(_size), step(static_cast<ssize_t>(_step))
+    {
+    }
+    Strided1DIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
+        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        // ensure 0 <= gid < size
+        return offset + std::min(gid, size - 1) * step;
+    }
+
+private:
+    ssize_t offset = 0;
+    std::size_t size = 1;
+    ssize_t step = 1;
+};
+
+static_assert(sycl::is_device_copyable_v<Strided1DIndexer>);
+
+struct Strided1DCyclicIndexer
+{
+    Strided1DCyclicIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
+        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
+    {
+    }
+
+    ssize_t operator()(std::size_t gid) const
+    {
+        return offset + (gid % size) * step;
+    }
+
+private:
+    ssize_t offset = 0;
+    std::size_t size = 1;
+    ssize_t step = 1;
+};
+
+static_assert(sycl::is_device_copyable_v<Strided1DCyclicIndexer>);
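Reviewer note, not part of the diff: the difference between the two 1D indexers in concrete numbers (host-side, illustrative only; include path assumed from this PR):

```cpp
#include <iostream>

#include "utils/offset_utils.hpp"

int main()
{
    using ssizeT = dpctl::tensor::ssize_t;
    using namespace dpctl::tensor::offset_utils;

    // offset 0, 4 elements, step 2: valid offsets are 0, 2, 4, 6
    Strided1DIndexer clamped{ssizeT(0), ssizeT(4), ssizeT(2)};
    std::cout << clamped(3) << '\n';  // 6
    std::cout << clamped(10) << '\n'; // 6: out-of-range gid clamps to the last element
    Strided1DCyclicIndexer wrapped{ssizeT(0), ssizeT(4), ssizeT(2)};
    std::cout << wrapped(5) << '\n';  // 2: (5 % 4) * 2, wraps around instead
}
```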
+
+template <typename displacementT>
+struct TwoOffsets
+{
+    constexpr TwoOffsets() : first_offset(0), second_offset(0) {}
+    constexpr TwoOffsets(const displacementT &first_offset_,
+                         const displacementT &second_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+};
+
+struct TwoOffsets_StridedIndexer
+{
+    TwoOffsets_StridedIndexer(int common_nd,
+                              ssize_t first_offset_,
+                              ssize_t second_offset_,
+                              ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    TwoOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    TwoOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t const *shape_strides;
+
+    TwoOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            relative_first_offset, relative_second_offset);
+        return TwoOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset);
+    }
+};
+
+struct TwoZeroOffsets_Indexer
+{
+    constexpr TwoZeroOffsets_Indexer() {}
+
+    constexpr TwoOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return TwoOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<TwoZeroOffsets_Indexer>);
+
+template <typename FirstIndexerT, typename SecondIndexerT>
+struct TwoOffsets_CombinedIndexer
+{
+private:
+    FirstIndexerT first_indexer_;
+    SecondIndexerT second_indexer_;
+
+public:
+    constexpr TwoOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
+                                         const SecondIndexerT &second_indexer)
+        : first_indexer_(first_indexer), second_indexer_(second_indexer)
+    {
+    }
+
+    constexpr TwoOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return TwoOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid));
+    }
+};
+
+template <typename displacementT>
+struct ThreeOffsets
+{
+    constexpr ThreeOffsets()
+        : first_offset(0), second_offset(0), third_offset(0)
+    {
+    }
+    constexpr ThreeOffsets(const displacementT &first_offset_,
+                           const displacementT &second_offset_,
+                           const displacementT &third_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_),
+          third_offset(third_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+    displacementT third_offset = 0;
+};
+
+struct ThreeOffsets_StridedIndexer
+{
+    ThreeOffsets_StridedIndexer(int common_nd,
+                                ssize_t first_offset_,
+                                ssize_t second_offset_,
+                                ssize_t third_offset_,
+                                ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          starting_third_offset(third_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    ThreeOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t starting_third_offset;
+    ssize_t const *shape_strides;
+
+    ThreeOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        ssize_t relative_third_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            shape_strides + 3 * nd, // strides ptr
+            relative_first_offset, relative_second_offset,
+            relative_third_offset);
+        return ThreeOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset,
+            starting_third_offset + relative_third_offset);
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<ThreeOffsets_StridedIndexer>);
+
+struct ThreeZeroOffsets_Indexer
+{
+    constexpr ThreeZeroOffsets_Indexer() {}
+
+    constexpr ThreeOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return ThreeOffsets<ssize_t>();
+    }
+
+    constexpr ThreeOffsets<ssize_t> operator()(std::size_t) const
+    {
+        return ThreeOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<ThreeZeroOffsets_Indexer>);
+
+template <typename FirstIndexerT,
+          typename SecondIndexerT,
+          typename ThirdIndexerT>
+struct ThreeOffsets_CombinedIndexer
+{
+private:
+    FirstIndexerT first_indexer_;
+    SecondIndexerT second_indexer_;
+    ThirdIndexerT third_indexer_;
+
+public:
+    constexpr ThreeOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
+                                           const SecondIndexerT &second_indexer,
+                                           const ThirdIndexerT &third_indexer)
+        : first_indexer_(first_indexer), second_indexer_(second_indexer),
+          third_indexer_(third_indexer)
+    {
+    }
+
+    constexpr ThreeOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return ThreeOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid),
+                                     third_indexer_(gid));
+    }
+};
+
+template <typename displacementT>
+struct FourOffsets
+{
+    constexpr FourOffsets()
+        : first_offset(0), second_offset(0), third_offset(0), fourth_offset(0)
+    {
+    }
+    constexpr FourOffsets(const displacementT &first_offset_,
+                          const displacementT &second_offset_,
+                          const displacementT &third_offset_,
+                          const displacementT &fourth_offset_)
+        : first_offset(first_offset_), second_offset(second_offset_),
+          third_offset(third_offset_), fourth_offset(fourth_offset_)
+    {
+    }
+
+    constexpr displacementT get_first_offset() const { return first_offset; }
+    constexpr displacementT get_second_offset() const { return second_offset; }
+    constexpr displacementT get_third_offset() const { return third_offset; }
+    constexpr displacementT get_fourth_offset() const { return fourth_offset; }
+
+private:
+    displacementT first_offset = 0;
+    displacementT second_offset = 0;
+    displacementT third_offset = 0;
+    displacementT fourth_offset = 0;
+};
+
+struct FourOffsets_StridedIndexer
+{
+    constexpr FourOffsets_StridedIndexer(int common_nd,
+                                         ssize_t first_offset_,
+                                         ssize_t second_offset_,
+                                         ssize_t third_offset_,
+                                         ssize_t fourth_offset_,
+                                         ssize_t const *_packed_shape_strides)
+        : nd(common_nd), starting_first_offset(first_offset_),
+          starting_second_offset(second_offset_),
+          starting_third_offset(third_offset_),
+          starting_fourth_offset(fourth_offset_),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    constexpr FourOffsets<ssize_t> operator()(ssize_t gid) const
+    {
+        return compute_offsets(gid);
+    }
+
+    constexpr FourOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        return compute_offsets(static_cast<ssize_t>(gid));
+    }
+
+private:
+    int nd;
+    ssize_t starting_first_offset;
+    ssize_t starting_second_offset;
+    ssize_t starting_third_offset;
+    ssize_t starting_fourth_offset;
+    ssize_t const *shape_strides;
+
+    FourOffsets<ssize_t> compute_offsets(ssize_t gid) const
+    {
+        using dpctl::tensor::strides::CIndexer_vector;
+
+        CIndexer_vector<ssize_t> _ind(nd);
+        ssize_t relative_first_offset(0);
+        ssize_t relative_second_offset(0);
+        ssize_t relative_third_offset(0);
+        ssize_t relative_fourth_offset(0);
+        _ind.get_displacement(
+            gid,
+            shape_strides,          // shape ptr
+            shape_strides + nd,     // strides ptr
+            shape_strides + 2 * nd, // strides ptr
+            shape_strides + 3 * nd, // strides ptr
+            shape_strides + 4 * nd, // strides ptr
+            relative_first_offset, relative_second_offset,
+            relative_third_offset, relative_fourth_offset);
+        return FourOffsets<ssize_t>(
+            starting_first_offset + relative_first_offset,
+            starting_second_offset + relative_second_offset,
+            starting_third_offset + relative_third_offset,
+            starting_fourth_offset + relative_fourth_offset);
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<FourOffsets_StridedIndexer>);
+
+struct FourZeroOffsets_Indexer
+{
+    constexpr FourZeroOffsets_Indexer() {}
+
+    constexpr FourOffsets<ssize_t> operator()(ssize_t) const
+    {
+        return FourOffsets<ssize_t>();
+    }
+};
+
+static_assert(sycl::is_device_copyable_v<FourZeroOffsets_Indexer>);
+
+struct NthStrideOffset
+{
+    NthStrideOffset(int common_nd,
+                    ssize_t const *_offsets,
+                    ssize_t const *_packed_shape_strides)
+        : _ind(common_nd), nd(common_nd), offsets(_offsets),
+          shape_strides(_packed_shape_strides)
+    {
+    }
+
+    std::size_t operator()(ssize_t gid, int n) const
+    {
+        ssize_t relative_offset(0);
+        _ind.get_displacement(
+            gid, shape_strides, shape_strides + ((n + 1) * nd),
+            relative_offset);
+
+        return relative_offset + offsets[n];
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_vector<ssize_t> _ind;
+
+    int nd;
+    ssize_t const *offsets;
+    ssize_t const *shape_strides;
+};
+
+static_assert(sycl::is_device_copyable_v<NthStrideOffset>);
+
+template <int nd>
+struct FixedDimStridedIndexer
+{
+    FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
+                           const std::array<ssize_t, nd> &_strides,
+                           ssize_t _offset)
+        : _ind(_shape), strides(_strides), starting_offset(_offset)
+    {
+    }
+    std::size_t operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset = 0;
+
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset += mi[i] * strides[i];
+        }
+        return starting_offset + relative_offset;
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides;
+    ssize_t starting_offset;
+};
+
+static_assert(sycl::is_device_copyable_v<FixedDimStridedIndexer<1>>);
+
+template <int nd>
+struct TwoOffsets_FixedDimStridedIndexer
+{
+    TwoOffsets_FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
+                                      const std::array<ssize_t, nd> &_strides1,
+                                      const std::array<ssize_t, nd> &_strides2,
+                                      ssize_t _offset1,
+                                      ssize_t _offset2)
+        : _ind(_shape), strides1(_strides1), strides2(_strides2),
+          starting_offset1(_offset1), starting_offset2(_offset2)
+    {
+    }
+
+    TwoOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset1 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset1 += mi[i] * strides1[i];
+        }
+
+        ssize_t relative_offset2 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset2 += mi[i] * strides2[i];
+        }
+
+        return TwoOffsets<ssize_t>(starting_offset1 + relative_offset1,
+                                   starting_offset2 + relative_offset2);
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides1;
+    std::array<ssize_t, nd> strides2;
+    ssize_t starting_offset1;
+    ssize_t starting_offset2;
+};
+
+static_assert(
+    sycl::is_device_copyable_v<TwoOffsets_FixedDimStridedIndexer<1>>);
+
+template <int nd>
+struct ThreeOffsets_FixedDimStridedIndexer
+{
+    ThreeOffsets_FixedDimStridedIndexer(
+        const std::array<ssize_t, nd> &_shape,
+        const std::array<ssize_t, nd> &_strides1,
+        const std::array<ssize_t, nd> &_strides2,
+        const std::array<ssize_t, nd> &_strides3,
+        ssize_t _offset1,
+        ssize_t _offset2,
+        ssize_t _offset3)
+        : _ind(_shape), strides1(_strides1), strides2(_strides2),
+          strides3(_strides3), starting_offset1(_offset1),
+          starting_offset2(_offset2), starting_offset3(_offset3)
+    {
+    }
+
+    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
+    {
+        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
+            std::move(_ind));
+        local_indexer.set(gid);
+        auto mi = local_indexer.get();
+
+        ssize_t relative_offset1 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset1 += mi[i] * strides1[i];
+        }
+
+        ssize_t relative_offset2 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset2 += mi[i] * strides2[i];
+        }
+
+        ssize_t relative_offset3 = 0;
+#pragma unroll
+        for (int i = 0; i < nd; ++i) {
+            relative_offset3 += mi[i] * strides3[i];
+        }
+
+        return ThreeOffsets<ssize_t>(starting_offset1 + relative_offset1,
+                                     starting_offset2 + relative_offset2,
+                                     starting_offset3 + relative_offset3);
+    }
+
+private:
+    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
+
+    std::array<ssize_t, nd> strides1;
+    std::array<ssize_t, nd> strides2;
+    std::array<ssize_t, nd> strides3;
+    ssize_t starting_offset1;
+    ssize_t starting_offset2;
+    ssize_t starting_offset3;
+};
+
+static_assert(
+    sycl::is_device_copyable_v<ThreeOffsets_FixedDimStridedIndexer<1>>);
+} // namespace dpctl::tensor::offset_utils
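Reviewer note, not part of the diff: the `*_CombinedIndexer` templates compose the simple indexers above. A hedged sketch for a binary-op style kernel whose input is strided and whose output is contiguous; names are illustrative:

```cpp
#include "utils/offset_utils.hpp" // path assumed per this PR's layout

namespace ou = dpctl::tensor::offset_utils;
using ssizeT = dpctl::tensor::ssize_t;

// first offset: strided input; second offset: gid itself (contiguous output)
using InOutIndexerT =
    ou::TwoOffsets_CombinedIndexer<ou::StridedIndexer, ou::NoOpIndexer>;

ou::TwoOffsets<ssizeT> offsets_for(int nd, const ssizeT *packed, ssizeT gid)
{
    const InOutIndexerT indexer{ou::StridedIndexer{nd, 0, packed},
                                ou::NoOpIndexer{}};
    return indexer(gid);
}
```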
diff --git a/dpnp/tensor/libtensor/include/utils/output_validation.hpp b/dpnp/tensor/libtensor/include/utils/output_validation.hpp
new file mode 100644
index 000000000000..26f1b29bd3d8
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/output_validation.hpp
@@ -0,0 +1,79 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities for determining if an array is a valid output
+/// array.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstddef>
+
+#include <pybind11/pybind11.h>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::validation
+{
+namespace py = pybind11;
+
+/*! @brief Raises a value error if an array is read-only.
+
+    This should be called with an array before writing.*/
+struct CheckWritable
+{
+    static void throw_if_not_writable(const dpctl::tensor::usm_ndarray &arr)
+    {
+        if (!arr.is_writable()) {
+            throw py::value_error("output array is read-only.");
+        }
+        return;
+    }
+};
+
+/*! @brief Raises a value error if an array's memory is not sufficiently ample
+    to accommodate an input number of elements.
+
+    This should be called with an array before writing.*/
+struct AmpleMemory
+{
+    template <typename T>
+    static void throw_if_not_ample(const dpctl::tensor::usm_ndarray &arr,
+                                   T nelems)
+    {
+        auto arr_offsets = arr.get_minmax_offsets();
+        T range = static_cast<T>(arr_offsets.second - arr_offsets.first);
+        if (range + 1 < nelems) {
+            throw py::value_error("Memory addressed by the output array is not "
+                                  "sufficiently ample.");
+        }
+        return;
+    }
+};
+} // namespace dpctl::tensor::validation
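Reviewer note, not part of the diff: a hedged sketch of the intended use at the top of a Python-exposed binding that writes `nelems` elements into `dst`; the wrapper name is an assumption:

```cpp
#include "utils/output_validation.hpp" // path assumed per this PR's layout

void validate_output(const dpctl::tensor::usm_ndarray &dst, std::size_t nelems)
{
    using namespace dpctl::tensor::validation;
    CheckWritable::throw_if_not_writable(dst);
    AmpleMemory::throw_if_not_ample(dst, nelems);
}
```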
diff --git a/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
new file mode 100644
index 000000000000..5d03294392d8
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/rich_comparisons.hpp
@@ -0,0 +1,149 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines comparators for sorting that impose a total order on
+/// floating-point and complex values, including NaNs.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cmath>
+#include <complex>
+#include <functional>
+#include <type_traits>
+
+#include "sycl/sycl.hpp"
+
+namespace dpctl::tensor::rich_comparisons
+{
+
+namespace detail
+{
+template <typename fpT>
+struct ExtendedRealFPLess
+{
+    /* [R, nan] */
+    bool operator()(const fpT v1, const fpT v2) const
+    {
+        return (!std::isnan(v1) && (std::isnan(v2) || (v1 < v2)));
+    }
+};
+
+template <typename fpT>
+struct ExtendedRealFPGreater
+{
+    bool operator()(const fpT v1, const fpT v2) const
+    {
+        return (!std::isnan(v2) && (std::isnan(v1) || (v2 < v1)));
+    }
+};
+
+template <typename cT>
+struct ExtendedComplexFPLess
+{
+    /* [(R, R), (R, nan), (nan, R), (nan, nan)] */
+
+    bool operator()(const cT &v1, const cT &v2) const
+    {
+        using realT = typename cT::value_type;
+
+        const realT real1 = std::real(v1);
+        const realT real2 = std::real(v2);
+
+        const bool r1_nan = std::isnan(real1);
+        const bool r2_nan = std::isnan(real2);
+
+        const realT imag1 = std::imag(v1);
+        const realT imag2 = std::imag(v2);
+
+        const bool i1_nan = std::isnan(imag1);
+        const bool i2_nan = std::isnan(imag2);
+
+        const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0);
+        const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 1 : 0);
+
+        const bool res =
+            !(r1_nan && i1_nan) &&
+            ((idx1 < idx2) ||
+             ((idx1 == idx2) &&
+              ((r1_nan && !i1_nan && (imag1 < imag2)) ||
+               (!r1_nan && i1_nan && (real1 < real2)) ||
+               (!r1_nan && !i1_nan &&
+                ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2)))))));
+
+        return res;
+    }
+};
+
+template <typename cT>
+struct ExtendedComplexFPGreater
+{
+    bool operator()(const cT &v1, const cT &v2) const
+    {
+        auto less_ = ExtendedComplexFPLess<cT>{};
+        return less_(v2, v1);
+    }
+};
+
+template <typename T>
+inline constexpr bool is_fp_v =
+    (std::is_same_v<T, sycl::half> || std::is_same_v<T, float> ||
+     std::is_same_v<T, double>);
+
+} // namespace detail
+
+template <typename argTy>
+struct AscendingSorter
+{
+    using type = std::conditional_t<detail::is_fp_v<argTy>,
+                                    detail::ExtendedRealFPLess<argTy>,
+                                    std::less<argTy>>;
+};
+
+template <typename T>
+struct AscendingSorter<std::complex<T>>
+{
+    using type = detail::ExtendedComplexFPLess<std::complex<T>>;
+};
+
+template <typename argTy>
+struct DescendingSorter
+{
+    using type = std::conditional_t<detail::is_fp_v<argTy>,
+                                    detail::ExtendedRealFPGreater<argTy>,
+                                    std::greater<argTy>>;
+};
+
+template <typename T>
+struct DescendingSorter<std::complex<T>>
+{
+    using type = detail::ExtendedComplexFPGreater<std::complex<T>>;
+};
+
+} // namespace dpctl::tensor::rich_comparisons
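Reviewer note, not part of the diff: a hedged host-side illustration of the sorter selection. `AscendingSorter<double>::type` resolves to the NaN-aware comparator, so NaNs sort after all finite values:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

#include "utils/rich_comparisons.hpp" // path assumed per this PR's layout

int main()
{
    using SorterT =
        dpctl::tensor::rich_comparisons::AscendingSorter<double>::type;
    std::vector<double> v{2.0, std::nan(""), -1.0};
    std::sort(v.begin(), v.end(), SorterT{});
    // v == {-1.0, 2.0, nan}: NaNs order after all numbers
}
```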
diff --git a/dpnp/tensor/libtensor/include/utils/strided_iters.hpp b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
new file mode 100644
index 000000000000..65250b755b56
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/strided_iters.hpp
@@ -0,0 +1,984 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines CIndexer_array, and CIndexer_vector classes, as well
+/// iteration space simplifiers.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <numeric>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+namespace dpctl::tensor::strides
+{
+/* An N-dimensional array can be stored in a single
+ * contiguous chunk of memory by contiguously laying
+ * array elements in lexicographic order of their
+ * array indices. Such a layout is called C-contiguous.
+ *
+ * E.g. for (2, 3, 2) array `a` with zero-based indexing convention
+ * the C-array's elements are
+ *   { a[0,0,0], a[0,0,1], a[0,1,0], a[0,1,1], a[0,2,0], a[0,2,1],
+ *     a[1,0,0], a[1,0,1], a[1,1,0], a[1,1,1], a[1,2,0], a[1,2,1] }
+ *
+ * Indexer maps zero-based index in C-array to a multi-index
+ * for the purpose of computing element displacement in the
+ * strided array, i.e. in the above example for k = 5, the displacement
+ * is (s0*0 + s1*2 + s2*1), and for k = 7 it is (s0*1 + s1*0 + s2*1)
+ * for N-dimensional array with strides (s0, s1, s2).
+ *
+ * CIndexer_vector need not know the array rank `dim` at compile time.
+ * Shape and strides are stored in std::vector, which is not trivially
+ * copyable.
+ *
+ * For the class to remain trivially copyable for offloading, the
+ * displacement computation methods take accessor/pointer arguments
+ * for shape and strides, and modify a displacement argument passed
+ * by reference.
+ */
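Reviewer note, not part of the diff: a quick numeric check of the worked example in the comment above, using the same quotient/remainder recurrence as `CIndexer_vector::get_displacement`:

```cpp
#include <cassert>

int main()
{
    const long shape[3] = {2, 3, 2};
    const long strides[3] = {6, 2, 1}; // C-contiguous, in elements
    long k = 5, disp = 0;              // k = 5 -> multi-index (0, 2, 1)
    for (int dim = 3; --dim > 0;) {
        const long q = k / shape[dim];
        disp += (k - q * shape[dim]) * strides[dim];
        k = q;
    }
    disp += k * strides[0];
    assert(disp == 6 * 0 + 2 * 2 + 1 * 1); // s0*0 + s1*2 + s2*1 = 5
}
```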
+template <typename indT = std::ptrdiff_t>
+class CIndexer_vector
+{
+    static_assert(std::is_integral<indT>::value, "Integral type is required");
+    static_assert(std::is_signed<indT>::value,
+                  "Signed integral type is required");
+    int nd;
+
+public:
+    CIndexer_vector(int dim) : nd(dim) {}
+
+    template <class ShapeTy>
+    indT size(const ShapeTy &shape) const
+    {
+        indT s = static_cast<indT>(1);
+        for (int i = 0; i < nd; ++i) {
+            s *= shape[i];
+        }
+        return s;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride,
+                          indT &disp) const
+    {
+        if (nd == 1) {
+            disp = i * stride[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            d += r * stride[dim];
+            i_ = q;
+        }
+        disp = d + i_ * stride[0];
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          indT &disp1,
+                          indT &disp2) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+        }
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          const StridesTy &stride3,
+                          indT &disp1,
+                          indT &disp2,
+                          indT &disp3) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            disp3 = i * stride3[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0, d3 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+            d3 += r * stride3[dim];
+        };
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        disp3 = d3 + i_ * stride3[0];
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const StridesTy &stride1,
+                          const StridesTy &stride2,
+                          const StridesTy &stride3,
+                          const StridesTy &stride4,
+                          indT &disp1,
+                          indT &disp2,
+                          indT &disp3,
+                          indT &disp4) const
+    {
+        if (nd == 1) {
+            disp1 = i * stride1[0];
+            disp2 = i * stride2[0];
+            disp3 = i * stride3[0];
+            disp4 = i * stride4[0];
+            return;
+        }
+
+        indT i_ = i;
+        indT d1 = 0, d2 = 0, d3 = 0, d4 = 0;
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            i_ = q;
+            d1 += r * stride1[dim];
+            d2 += r * stride2[dim];
+            d3 += r * stride3[dim];
+            d4 += r * stride4[dim];
+        }
+        disp1 = d1 + i_ * stride1[0];
+        disp2 = d2 + i_ * stride2[0];
+        disp3 = d3 + i_ * stride3[0];
+        disp4 = d4 + i_ * stride4[0];
+        return;
+    }
+
+    template <int nstrides, class ShapeTy, class StridesTy>
+    void get_displacement(const indT i,
+                          const ShapeTy &shape,
+                          const std::array<StridesTy, nstrides> &strides,
+                          std::array<indT, nstrides> &disps) const
+    {
+        if (nd == 1) {
+            for (int k = 0; k < nstrides; ++k) {
+                disps[k] = i * strides[k][0];
+            }
+            return;
+        }
+
+        indT i_ = i;
+        std::array<indT, nstrides> ds;
+        for (int k = 0; k < nstrides; ++k) {
+            ds[k] = 0;
+        }
+
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            for (int k = 0; k < nstrides; ++k) {
+                ds[k] += r * strides[k][dim];
+            }
+            i_ = q;
+        };
+        for (int k = 0; k < nstrides; ++k) {
+            disps[k] = ds[k] + i_ * strides[k][0];
+        }
+        return;
+    }
+
+    template <class ShapeTy, class StridesTy>
+    void get_left_rolled_displacement(const indT i,
+                                      const ShapeTy &shape,
+                                      const StridesTy &stride,
+                                      const StridesTy &shifts,
+                                      indT &disp) const
+    {
+        indT i_ = i;
+        indT d(0);
+        for (int dim = nd; --dim > 0;) {
+            const indT si = shape[dim];
+            const indT q = i_ / si;
+            const indT r = (i_ - q * si);
+            // assumes si > shifts[dim] >= 0
+            const indT shifted_r =
+                (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]);
+            d += shifted_r * stride[dim];
+            i_ = q;
+        }
+        const indT shifted_r =
+            (i_ < shifts[0] ? i_ + shape[0] - shifts[0] : i_ - shifts[0]);
+        disp = d + shifted_r * stride[0];
+    }
+};
+
+/*
+ * CIndexer is for arrays whose array-rank is known at compile time.
+ * Statically allocated shape and multi_index arrays are members of
+ * the class instance, and it remains trivially copyable.
+ *
+ * Method `set(k)` populates work-item private array multi_index, which
+ * can be accessed using `get()` to compute the displacement as needed.
+ */
+
+template <int _ndim, typename indT = std::ptrdiff_t>
+class CIndexer_array
+{
+    static constexpr int ndim = _ndim;
+
+    static_assert(std::is_integral<indT>::value, "Integral type is required");
+    static_assert(std::is_signed<indT>::value,
+                  "Signed integral type is required");
+    static_assert(ndim > 0, "Dimensionality must be positive");
+
+private:
+    typedef std::array<indT, ndim> index_t;
+
+    indT elem_count;
+    index_t shape;
+    index_t multi_index;
+
+public:
+    CIndexer_array() : elem_count(0), shape{}, multi_index{} {}
+
+    explicit CIndexer_array(const index_t &input_shape)
+        : elem_count(0), shape{}, multi_index{}
+    {
+        indT s(1);
+        for (int i = 0; i < ndim; ++i) {
+            shape[i] = input_shape[i];
+            s *= input_shape[i];
+        }
+        elem_count = s;
+    }
+
+    indT size() const { return elem_count; }
+    indT rank() const { return ndim; }
+
+    void set(const indT i)
+    {
+        if (ndim == 1) {
+            multi_index[0] = i;
+            return;
+        }
+
+        indT i_ = i;
+#pragma unroll
+        for (int dim = ndim; --dim > 0;) {
+            indT si = shape[dim];
+            indT q = i_ / si;
+            multi_index[dim] = i_ - q * si;
+            i_ = q;
+        }
+        multi_index[0] = i_;
+    }
+
+    const index_t &get() const { return multi_index; }
+};
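Reviewer note, not part of the diff: a hedged mini-example of `CIndexer_array` for a fixed-rank (2, 3, 2) array; `set(k)` recovers the multi-index of the k-th element in C order:

```cpp
#include <cassert>

#include "utils/strided_iters.hpp" // path assumed per this PR's layout

int main()
{
    using dpctl::tensor::strides::CIndexer_array;
    CIndexer_array<3, long> ix({2, 3, 2});
    assert(ix.size() == 12);
    ix.set(5);
    const auto &mi = ix.get();
    assert(mi[0] == 0 && mi[1] == 2 && mi[2] == 1);
}
```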
+
+/*
+    For purposes of iterating over elements of array with
+    `shape` and `strides` given as pointers
+    `simplify_iteration_stride(nd, shape_ptr, strides_ptr, disp)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides, disp)` are such that iterating over
+    them will traverse the same elements, possibly in
+    different order.
+
+    ..Example: python
+        import itertools
+        # for some array Y over whose elements we iterate
+        csh, cst, cp = contract_iter(Y.shape, Y.strides)
+        def pointers_set(sh, st, p):
+            citers = itertools.product(*map(lambda s: range(s), sh))
+            dot = lambda st, it: sum(st[k]*it[k] for k in range(len(st)))
+            return set(p + dot(st, it) for it in citers)
+        ps1 = pointers_set(csh, cst, cp)
+        ps2 = pointers_set(Y.shape, Y.strides, 0)
+        assert ps1 == ps2
+
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_stride(const int nd,
+                              ShapeTy *shape,
+                              StridesTy *strides,
+                              StridesTy &disp)
+{
+    disp = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(), [&strides, &shape](int i1, int i2) {
+            auto abs_str1 = (strides[i1] < 0) ? -strides[i1] : strides[i1];
+            auto abs_str2 = (strides[i2] < 0) ? -strides[i2] : strides[i2];
+            return (abs_str1 > abs_str2) ||
+                   (abs_str1 == abs_str2 && shape[i1] > shape[i2]);
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides_w;
+    int nd_ = nd;
+    shape_w.reserve(nd_);
+    strides_w.reserve(nd_);
+
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str_p = strides[p];
+        shape_w.push_back(sh_p);
+        if (str_p < 0) {
+            disp += str_p * (sh_p - 1);
+            str_p = -str_p;
+        }
+        strides_w.push_back(str_p);
+    }
+
+    {
+        bool changed;
+        do {
+            changed = false;
+            for (int i = 0; i + 1 < nd_; ++i) {
+                StridesTy step = strides_w[i + 1];
+                StridesTy jump = strides_w[i] - (shape_w[i + 1] - 1) * step;
+                if (jump == step) {
+                    changed = true;
+                    for (int k = i; k + 1 < nd_; ++k) {
+                        strides_w[k] = strides_w[k + 1];
+                    }
+                    shape_w[i] *= shape_w[i + 1];
+                    for (int k = i + 1; k + 1 < nd_; ++k) {
+                        shape_w[k] = shape_w[k + 1];
+                    }
+                    --nd_;
+                }
+            }
+        } while (changed);
+    }
+
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides[i] = strides_w[i];
+    }
+
+    return nd_;
+}
+
+/*
+    For purposes of iterating over pairs of elements of two arrays
+    with `shape` and strides `strides1`, `strides2` given as pointers
+    `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
+    strides2_ptr, disp1, disp2)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
+    iterating over them will traverse the same set of pairs of elements,
+    possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_two_strides(const int nd,
+                                   ShapeTy *shape,
+                                   StridesTy *strides1,
+                                   StridesTy *strides2,
+                                   StridesTy &disp1,
+                                   StridesTy &disp2)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(), [&strides1, &strides2, &shape](int i1, int i2) {
+            auto abs_str1_i1 =
+                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+            auto abs_str1_i2 =
+                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+            auto abs_str2_i1 =
+                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+            auto abs_str2_i2 =
+                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+            return (abs_str2_i1 > abs_str2_i2) ||
+                   (abs_str2_i1 == abs_str2_i2 &&
+                    (abs_str1_i1 > abs_str1_i2 ||
+                     (abs_str1_i1 == abs_str1_i2 && shape[i1] > shape[i2])));
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && std::min(str1_p, str2_p) < 0) {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+        }
+        if (str1_p < 0 || str2_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+
+            if (jump1 == str1 && jump2 == str2) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+
+    return nd_;
+}
+
+template <class T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T> contract_iter(const vecT &shape, const vecT &strides)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides = strides;
+    T disp(0);
+
+    int nd = simplify_iteration_stride(dim, out_shape.data(),
+                                       out_strides.data(), disp);
+    out_shape.resize(nd);
+    out_strides.resize(nd);
+    return std::make_tuple(out_shape, out_strides, disp);
+}
+
+template <class T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T, vecT, T> contract_iter2(const vecT &shape,
+                                                  const vecT &strides1,
+                                                  const vecT &strides2)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides1.size() || dim != strides2.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides1 = strides1;
+    vecT out_strides2 = strides2;
+    T disp1(0);
+    T disp2(0);
+
+    int nd = simplify_iteration_two_strides(dim, out_shape.data(),
+                                            out_strides1.data(),
+                                            out_strides2.data(), disp1, disp2);
+    out_shape.resize(nd);
+    out_strides1.resize(nd);
+    out_strides2.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2);
+}
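Reviewer note, not part of the diff: a hedged illustration of `contract_iter` collapsing the iteration space of a C-contiguous 2D view to a single dimension:

```cpp
#include <cassert>
#include <stdexcept>
#include <vector>

#include "utils/strided_iters.hpp" // path assumed per this PR's layout

int main()
{
    using dpctl::tensor::strides::contract_iter;
    std::vector<long> shape{3, 4};
    std::vector<long> strides{4, 1}; // contiguous rows
    auto [sh, st, disp] =
        contract_iter<long, std::runtime_error>(shape, strides);
    assert(sh == std::vector<long>({12}) && st == std::vector<long>({1}));
    assert(disp == 0);
}
```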
+
+/*
+    For purposes of iterating over pairs of elements of three arrays
+    with `shape` and strides `strides1`, `strides2`, `strides3` given as
+    pointers `simplify_iteration_three_strides(nd, shape_ptr, strides1_ptr,
+    strides2_ptr, strides3_ptr, disp1, disp2, disp3)`
+    may modify memory and returns new length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3)`
+    are such that iterating over them will traverse the same set of tuples of
+    elements, possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_three_strides(const int nd,
+                                     ShapeTy *shape,
+                                     StridesTy *strides1,
+                                     StridesTy *strides2,
+                                     StridesTy *strides3,
+                                     StridesTy &disp1,
+                                     StridesTy &disp2,
+                                     StridesTy &disp3)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    disp3 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(pos.begin(), pos.end(),
+                     [&strides1, &strides2, &strides3, &shape](int i1, int i2) {
+                         auto abs_str1_i1 =
+                             (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+                         auto abs_str1_i2 =
+                             (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+                         auto abs_str2_i1 =
+                             (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+                         auto abs_str2_i2 =
+                             (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+                         auto abs_str3_i1 =
+                             (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
+                         auto abs_str3_i2 =
+                             (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
+                         return (abs_str3_i1 > abs_str3_i2) ||
+                                ((abs_str3_i1 == abs_str3_i2) &&
+                                 ((abs_str2_i1 > abs_str2_i2) ||
+                                  ((abs_str2_i1 == abs_str2_i2) &&
+                                   ((abs_str1_i1 > abs_str1_i2) ||
+                                    ((abs_str1_i1 == abs_str1_i2) &&
+                                     (shape[i1] > shape[i2]))))));
+                     });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+    std::vector<StridesTy> strides3_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        auto str3_p = strides3[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 &&
+            std::min({str1_p, str2_p, str3_p}) < 0) {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+            disp3 += str3_p * (sh_p - 1);
+            str3_p = -str3_p;
+        }
+        if (str1_p < 0 || str2_p < 0 || str3_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+        strides3_w.push_back(str3_p);
+    }
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy str3 = strides3_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
+
+            if (jump1 == str1 && jump2 == str2 && jump3 == str3) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides3_w[j] = strides3_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides3[i] = strides3_w[i];
+    }
+
+    return nd_;
+}
+
+template <class T, class Error, typename vecT = std::vector<T>>
+std::tuple<vecT, vecT, T, vecT, T, vecT, T>
+    contract_iter3(const vecT &shape,
+                   const vecT &strides1,
+                   const vecT &strides2,
+                   const vecT &strides3)
+{
+    const std::size_t dim = shape.size();
+    if (dim != strides1.size() || dim != strides2.size() ||
+        dim != strides3.size()) {
+        throw Error("Shape and strides must be of equal size.");
+    }
+    vecT out_shape = shape;
+    vecT out_strides1 = strides1;
+    vecT out_strides2 = strides2;
+    vecT out_strides3 = strides3;
+    T disp1(0);
+    T disp2(0);
+    T disp3(0);
+
+    int nd = simplify_iteration_three_strides(
+        dim, out_shape.data(), out_strides1.data(), out_strides2.data(),
+        out_strides3.data(), disp1, disp2, disp3);
+    out_shape.resize(nd);
+    out_strides1.resize(nd);
+    out_strides2.resize(nd);
+    out_strides3.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
+                           out_strides3, disp3);
+}
+
+/*
+    For purposes of iterating over pairs of elements of four arrays
+    with `shape` and strides `strides1`, `strides2`, `strides3`,
+    `strides4` given as pointers `simplify_iteration_four_strides(nd,
+    shape_ptr, strides1_ptr, strides2_ptr, strides3_ptr, strides4_ptr,
+    disp1, disp2, disp3, disp4)` may modify memory and returns new
+    length of these arrays.
+
+    The new shape and new strides, as well as the offset
+    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3,
+    new_stride4, disp4)` are such that iterating over them will traverse the
+    same set of tuples of elements, possibly in a different order.
+ */
+template <class ShapeTy, class StridesTy>
+int simplify_iteration_four_strides(const int nd,
+                                    ShapeTy *shape,
+                                    StridesTy *strides1,
+                                    StridesTy *strides2,
+                                    StridesTy *strides3,
+                                    StridesTy *strides4,
+                                    StridesTy &disp1,
+                                    StridesTy &disp2,
+                                    StridesTy &disp3,
+                                    StridesTy &disp4)
+{
+    disp1 = StridesTy(0);
+    disp2 = StridesTy(0);
+    disp3 = StridesTy(0);
+    disp4 = StridesTy(0);
+    if (nd < 2)
+        return nd;
+
+    std::vector<int> pos(nd);
+    std::iota(pos.begin(), pos.end(), 0);
+
+    std::stable_sort(
+        pos.begin(), pos.end(),
+        [&strides1, &strides2, &strides3, &strides4, &shape](int i1, int i2) {
+            auto abs_str1_i1 =
+                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
+            auto abs_str1_i2 =
+                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
+            auto abs_str2_i1 =
+                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
+            auto abs_str2_i2 =
+                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
+            auto abs_str3_i1 =
+                (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
+            auto abs_str3_i2 =
+                (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
+            auto abs_str4_i1 =
+                (strides4[i1] < 0) ? -strides4[i1] : strides4[i1];
+            auto abs_str4_i2 =
+                (strides4[i2] < 0) ? -strides4[i2] : strides4[i2];
+            return (abs_str4_i1 > abs_str4_i2) ||
+                   ((abs_str4_i1 == abs_str4_i2) &&
+                    ((abs_str3_i1 > abs_str3_i2) ||
+                     ((abs_str3_i1 == abs_str3_i2) &&
+                      ((abs_str2_i1 > abs_str2_i2) ||
+                       ((abs_str2_i1 == abs_str2_i2) &&
+                        ((abs_str1_i1 > abs_str1_i2) ||
+                         ((abs_str1_i1 == abs_str1_i2) &&
+                          (shape[i1] > shape[i2]))))))));
+        });
+
+    std::vector<ShapeTy> shape_w;
+    std::vector<StridesTy> strides1_w;
+    std::vector<StridesTy> strides2_w;
+    std::vector<StridesTy> strides3_w;
+    std::vector<StridesTy> strides4_w;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        auto p = pos[i];
+        auto sh_p = shape[p];
+        auto str1_p = strides1[p];
+        auto str2_p = strides2[p];
+        auto str3_p = strides3[p];
+        auto str4_p = strides4[p];
+        shape_w.push_back(sh_p);
+        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && str4_p <= 0 &&
+            std::min({str1_p, str2_p, str3_p, str4_p}) < 0) {
+            disp1 += str1_p * (sh_p - 1);
+            str1_p = -str1_p;
+            disp2 += str2_p * (sh_p - 1);
+            str2_p = -str2_p;
+            disp3 += str3_p * (sh_p - 1);
+            str3_p = -str3_p;
+            disp4 += str4_p * (sh_p - 1);
+            str4_p = -str4_p;
+        }
+        if (str1_p < 0 || str2_p < 0 || str3_p < 0 || str4_p < 0) {
+            contractable = false;
+        }
+        strides1_w.push_back(str1_p);
+        strides2_w.push_back(str2_p);
+        strides3_w.push_back(str3_p);
+        strides4_w.push_back(str4_p);
+    }
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str1 = strides1_w[i + 1];
+            StridesTy str2 = strides2_w[i + 1];
+            StridesTy str3 = strides3_w[i + 1];
+            StridesTy str4 = strides4_w[i + 1];
+            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
+            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
+            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
+            StridesTy jump4 = strides4_w[i] - (shape_w[i + 1] - 1) * str4;
+
+            if (jump1 == str1 && jump2 == str2 && jump3 == str3 &&
+                jump4 == str4) {
+                changed = true;
+                shape_w[i] *= shape_w[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides1_w[j] = strides1_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides2_w[j] = strides2_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides3_w[j] = strides3_w[j + 1];
+                }
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides4_w[j] = strides4_w[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape_w[j] = shape_w[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+    for (int i = 0; i < nd_; ++i) {
+        shape[i] = shape_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides1[i] = strides1_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides2[i] = strides2_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides3[i] = strides3_w[i];
+    }
+    for (int i = 0; i < nd_; ++i) {
+        strides4[i] = strides4_w[i];
+    }
+
+    return nd_;
+}
+    out_strides2.resize(nd);
+    out_strides3.resize(nd);
+    out_strides4.resize(nd);
+    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
+                           out_strides3, disp3, out_strides4, disp4);
+}
+
+/*
+   For purposes of iterating over elements of an array with `shape` and
+   strides `strides` given as pointers, `compact_iteration(nd, shape, strides)`
+   may modify memory and returns the new length of the array.
+
+   The new shape and new strides `(new_shape, new_strides)` are such that
+   iterating over them will traverse the same elements in the same order,
+   possibly with reduced dimensionality.
+ */
+template <typename ShapeTy, typename StridesTy>
+int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
+{
+    if (nd < 2)
+        return nd;
+
+    bool contractable = true;
+    for (int i = 0; i < nd; ++i) {
+        if (strides[i] < 0) {
+            contractable = false;
+        }
+    }
+
+    int nd_ = nd;
+    while (contractable) {
+        bool changed = false;
+        for (int i = 0; i + 1 < nd_; ++i) {
+            StridesTy str = strides[i + 1];
+            StridesTy jump = strides[i] - (shape[i + 1] - 1) * str;
+
+            if (jump == str) {
+                changed = true;
+                shape[i] *= shape[i + 1];
+                for (int j = i; j + 1 < nd_; ++j) {
+                    strides[j] = strides[j + 1];
+                }
+                for (int j = i + 1; j + 1 < nd_; ++j) {
+                    shape[j] = shape[j + 1];
+                }
+                --nd_;
+                break;
+            }
+        }
+        if (!changed)
+            break;
+    }
+
+    return nd_;
+}
+} // namespace dpctl::tensor::strides
diff --git a/dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
new file mode 100644
index 000000000000..76f0174b9fdf
--- /dev/null
+++ b/dpnp/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
@@ -0,0 +1,223 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+///
+/// \file
+/// This file defines utilities for USM memory allocation and deallocation,
+/// including smart-pointer wrappers over USM allocations.
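+///
+/// Example (an illustrative sketch, assuming a valid sycl::queue `q` and an
+/// element count `n`): scratch device memory is typically obtained via
+/// `smart_malloc_device` and its release scheduled with `async_smart_free`:
+///
+///     using namespace dpctl::tensor::alloc_utils;
+///     auto tmp = smart_malloc_device<int>(n, q); // unique_ptr + USMDeleter
+///     sycl::event e = q.fill<int>(tmp.get(), 0, n);
+///     // ownership moves to a host_task that frees `tmp` once `e` completes
+///     sycl::event cleanup_ev = async_smart_free(q, {e}, tmp);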
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::alloc_utils +{ +template +class usm_host_allocator : public sycl::usm_allocator +{ +public: + using baseT = sycl::usm_allocator; + using baseT::baseT; + + template + struct rebind + { + typedef usm_host_allocator other; + }; + + void deallocate(T *ptr, std::size_t n) + { + try { + baseT::deallocate(ptr, n); + } catch (const std::exception &e) { + std::cerr + << "Exception caught in `usm_host_allocator::deallocate`: " + << e.what() << std::endl; + } + } +}; + +template +void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept +{ + try { + sycl::free(ptr, ctx); + } catch (const std::exception &e) { + std::cerr << "Call to sycl::free caught exception: " << e.what() + << std::endl; + } +} + +template +void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept +{ + sycl_free_noexcept(ptr, q.get_context()); +} + +class USMDeleter +{ +private: + sycl::context ctx_; + +public: + USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {} + USMDeleter(const sycl::context &ctx) : ctx_(ctx) {} + + template + void operator()(T *ptr) const + { + sycl_free_noexcept(ptr, ctx_); + } +}; + +template +std::unique_ptr + smart_malloc(std::size_t count, + const sycl::queue &q, + sycl::usm::alloc kind, + const sycl::property_list &propList = {}) +{ + T *ptr = sycl::malloc(count, q, kind, propList); + if (nullptr == ptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + auto usm_deleter = USMDeleter(q); + return std::unique_ptr(ptr, usm_deleter); +} + +template +std::unique_ptr + smart_malloc_device(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::device, propList); +} + +template +std::unique_ptr + smart_malloc_shared(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::shared, propList); +} + +template +std::unique_ptr + smart_malloc_host(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::host, propList); +} + +namespace detail +{ +template +struct valid_smart_ptr : public std::false_type +{ +}; + +template +struct valid_smart_ptr &> + : public std::is_same +{ +}; + +template +struct valid_smart_ptr> + : public std::is_same +{ +}; + +// base case +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = true; +}; + +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = valid_smart_ptr::value && + (all_valid_smart_ptrs::value); +}; +} // end of namespace detail + +/*! 
@brief Submit host_task and transfer ownership from smart pointers to it */ +template +sycl::event async_smart_free(sycl::queue &exec_q, + const std::vector &depends, + UniquePtrTs &&...unique_pointers) +{ + static constexpr std::size_t n = sizeof...(UniquePtrTs); + static_assert( + n > 0, "async_smart_free requires at least one smart pointer argument"); + + static_assert( + detail::all_valid_smart_ptrs::value, + "async_smart_free requires unique_ptr created with smart_malloc"); + + std::vector ptrs; + ptrs.reserve(n); + (ptrs.push_back(reinterpret_cast(unique_pointers.get())), ...); + + std::vector dels; + dels.reserve(n); + (dels.emplace_back(unique_pointers.get_deleter()), ...); + + sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.host_task([ptrs = std::move(ptrs), dels = std::move(dels)]() { + for (std::size_t i = 0; i < ptrs.size(); ++i) { + dels[i](ptrs[i]); + } + }); + }); + + // Upon successful submission of host_task, USM allocations are owned + // by the host_task. Release smart pointer ownership to avoid double + // deallocation + (unique_pointers.release(), ...); + + return ht_e; +} +} // namespace dpctl::tensor::alloc_utils diff --git a/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp new file mode 100644 index 000000000000..9ae41e5ade6e --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/sycl_utils.hpp @@ -0,0 +1,674 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines utilities used for kernel submission. 
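+///
+/// Example (an illustrative sketch, assuming a valid sycl::queue `q` and a
+/// 1-D problem size `nelems`):
+///
+///     const std::vector<std::size_t> sg_sizes =
+///         q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+///     // smallest multiple of a supported sub-group size (up to 4x here)
+///     // such that a single work-group can cover `nelems`, if possible
+///     const std::size_t wg =
+///         dpctl::tensor::sycl_utils::choose_workgroup_size<4>(nelems,
+///                                                             sg_sizes);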
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "math_utils.hpp" + +namespace dpctl::tensor::sycl_utils +{ +namespace detail +{ +template +struct TypeList; + +template +struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> +struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template +struct IsContained : std::false_type +{ +}; + +template +struct IsComplex : std::false_type +{ +}; +template +struct IsComplex> : std::true_type +{ +}; +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template +struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value; +}; + +/*! @brief Find the smallest multiple of supported sub-group size larger than + * nelems */ +template +std::size_t choose_workgroup_size(const std::size_t nelems, + const std::vector &sg_sizes) +{ + std::vector wg_choices; + wg_choices.reserve(f * sg_sizes.size()); + + for (const auto &sg_size : sg_sizes) { +#pragma unroll + for (std::size_t i = 1; i <= f; ++i) { + wg_choices.push_back(sg_size * i); + } + } + std::sort(std::begin(wg_choices), std::end(wg_choices)); + + std::size_t wg = 1; + for (std::size_t i = 0; i < wg_choices.size(); ++i) { + if (wg_choices[i] == wg) { + continue; + } + wg = wg_choices[i]; + std::size_t n_groups = ((nelems + wg - 1) / wg); + if (n_groups == 1) + break; + } + + return wg; +} + +namespace detail +{ + +template +void _fold(LocAccT &local_mem_acc, + const std::uint32_t lid, + const std::uint32_t cutoff, + const std::uint32_t step, + const OpT &op) +{ + if (lid < cutoff) { + local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]); + } +} + +template +void _fold(LocAccT &local_mem_acc, + const std::uint32_t lid, + const std::uint32_t step, + const OpT &op) +{ + if (lid < step) { + local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]); + } +} + +} // end of namespace detail + +template +T custom_reduce_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T &local_val, + const OpT &op) +{ + // value experimentally tuned to achieve best runtime on Iris Xe, + // Arc A140V integrated Intel GPUs, and discrete Intel Max GPU. 
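+    // Illustrative folding schedule (assuming wgs = 96, not a power of
+    // two): the active work-item count shrinks as 96 -> 48 -> 24 -> 12
+    // -> 6, at each step folding the upper half into the lower half;
+    // the remaining partial values (fewer than low_sz) are combined
+    // sequentially by the group leader and the result is broadcast to
+    // the whole work-group.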
+    static constexpr std::uint32_t low_sz = 8u;
+    // maximal work-group size
+    static constexpr std::uint32_t high_sz = 1024u;
+    const std::uint32_t wgs = wg.get_local_linear_range();
+    const std::uint32_t lid = wg.get_local_linear_id();
+
+    local_mem_acc[lid] = local_val;
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    std::uint32_t n_witems = wgs;
+    if (wgs & (wgs - 1)) {
+        // wgs is not a power of 2
+#pragma unroll
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
+            if (n_witems >= sz) {
+                const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
+                detail::_fold(local_mem_acc, lid, n_witems - n_witems_,
+                              n_witems_, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+                n_witems = n_witems_;
+            }
+        }
+    }
+    else {
+        // wgs is a power of 2
+#pragma unroll
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
+            if (n_witems >= sz) {
+                n_witems >>= 1;
+                detail::_fold(local_mem_acc, lid, n_witems, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+            }
+        }
+    }
+
+    T red_val_over_wg = local_mem_acc[0];
+    if (wg.leader()) {
+        for (std::uint32_t i = 1; i < n_witems; ++i) {
+            red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]);
+        }
+    }
+
+    return sycl::group_broadcast(wg, red_val_over_wg, 0);
+}
+
+template <typename GroupT,
+          typename SubGroupT,
+          typename LocAccT,
+          typename T,
+          typename OpT>
+T custom_inclusive_scan_over_group(GroupT &&wg,
+                                   SubGroupT &&sg,
+                                   LocAccT &&local_mem_acc,
+                                   const T &local_val,
+                                   const T &identity,
+                                   OpT &&op)
+{
+    const std::uint32_t local_id = wg.get_local_id(0);
+    const std::uint32_t wgs = wg.get_local_range(0);
+
+    const std::uint32_t lane_id = sg.get_local_id()[0];
+    const std::uint32_t sgSize = sg.get_local_range()[0];
+
+    T scan_val = local_val;
+    for (std::uint32_t step = 1; step < sgSize; step *= 2) {
+        const bool advanced_lane = (lane_id >= step);
+        const std::uint32_t src_lane_id =
+            (advanced_lane ? lane_id - step : lane_id);
+        const T modifier = sycl::select_from_group(sg, scan_val, src_lane_id);
+        if (advanced_lane) {
+            scan_val = op(scan_val, modifier);
+        }
+    }
+
+    local_mem_acc[local_id] = scan_val;
+    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+
+    const std::uint32_t max_sgSize = sg.get_max_local_range()[0];
+    const std::uint32_t sgr_id = sg.get_group_id()[0];
+
+    // now scan the per-sub-group aggregates
+    const std::uint32_t n_aggregates = 1 + ((wgs - 1) / max_sgSize);
+    const bool large_wg = (n_aggregates > max_sgSize);
+    if (large_wg) {
+        if (wg.leader()) {
+            T _scan_val = identity;
+            for (std::uint32_t i = 1; i <= n_aggregates - max_sgSize; ++i) {
+                _scan_val = op(local_mem_acc[i * max_sgSize - 1], _scan_val);
+                local_mem_acc[i * max_sgSize - 1] = _scan_val;
+            }
+        }
+        sycl::group_barrier(wg, sycl::memory_scope::work_group);
+    }
+
+    if (sgr_id == 0) {
+        const std::uint32_t offset =
+            (large_wg) ? n_aggregates - max_sgSize : 0u;
+        const bool in_range = (lane_id < n_aggregates);
+        const bool in_bounds = in_range && (lane_id > 0 || large_wg);
+
+        // There is a bug where IGC incorrectly optimizes the code below:
+        //     T __scan_val = (in_bounds)
+        //         ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
+        //         : identity;
+        // so that `__scan_val` does not get initialized with the `identity`
+        // value where it must be, e.g. for
+        //     wgs = 256, max_sgSize = 16 => n_aggregates = 16
+        //     wi = 0: in_range = 1, in_bounds = 0 => __scan_val = identity
+        // The workaround below adds a SYCL atomic fence: the explicit memory
+        // fence prevents the reordering/elimination, at the cost of a slight
+        // overhead.
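+        // Illustrative example (assuming wgs = 64 and max_sgSize = 16):
+        // there are n_aggregates = 4 sub-group totals, stored at
+        // local-memory indices 15, 31 and 47; lanes 1..3 of sub-group 0
+        // read them, while lane 0 keeps `identity` since no aggregate
+        // precedes it.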
+ T __scan_val = identity; + sycl::atomic_fence(sycl::memory_order::relaxed, + sycl::memory_scope::work_item); + if (in_bounds) { + __scan_val = local_mem_acc[(offset + lane_id) * max_sgSize - 1]; + } + for (std::uint32_t step = 1; step < sgSize; step *= 2) { + const bool advanced_lane = (lane_id >= step); + const std::uint32_t src_lane_id = + (advanced_lane ? lane_id - step : lane_id); + const T modifier = + sycl::select_from_group(sg, __scan_val, src_lane_id); + if (advanced_lane && in_range) { + __scan_val = op(__scan_val, modifier); + } + } + if (in_bounds) { + local_mem_acc[(offset + lane_id) * max_sgSize - 1] = __scan_val; + } + } + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + if (sgr_id > 0) { + const T modifier = local_mem_acc[sgr_id * max_sgSize - 1]; + scan_val = op(scan_val, modifier); + } + + // ensure all work-items finished reading from SLM + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return scan_val; +} + +// Reduction functors + +// Maximum + +template +struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template +struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? x : y; + } + } +}; + +// Define identities and operator checking structs + +template +struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMaximum = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMinimum = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? 
static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclPlus = std::bool_constant>>; + +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclMultiplies = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// LogSumExp + +template +struct LogSumExp +{ + T operator()(const T &x, const T &y) const + { + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(x, y); + } +}; + +template +using IsLogSumExp = std::bool_constant>>; + +// only defined for types with infinity +template +struct GetIdentity::value>> +{ + static constexpr T value = -std::numeric_limits::infinity(); +}; + +// Hypot + +template +struct Hypot +{ + T operator()(const T &x, const T &y) const { return sycl::hypot(x, y); } +}; + +template +using IsHypot = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = 0; +}; + +// Logical_And + +template +using IsLogicalAnd = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclLogicalAnd = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// Logical_Or + +template +using IsLogicalOr = + std::bool_constant> || + std::is_same_v>>; + +template +using IsSyclLogicalOr = + std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(0); +}; + +// Identity + +template +struct Identity +{ +}; + +template +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + +// Sub-group load/store + +#ifndef USE_GROUP_LOAD_STORE +#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE) && \ + SYCL_EXT_ONEAPI_GROUP_LOAD_STORE +#define USE_GROUP_LOAD_STORE 1 +#else +#if defined(__LIBSYCL_MAJOR_VERSION) && (__LIBSYCL_MAJOR_VERSION >= 8u) +#define USE_GROUP_LOAD_STORE 1 +#else +#define USE_GROUP_LOAD_STORE 0 +#endif +#endif +#endif + +#if (USE_GROUP_LOAD_STORE) +namespace ls_ns = sycl::ext::oneapi::experimental; +#endif + +template +auto sub_group_load(const sycl::sub_group &sg, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + using ValueT = typename std::remove_cv_t; + sycl::vec x{}; + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_load(sg, m_ptr, x, striped); + return x; +#else + return sg.load(m_ptr); +#endif +} + +template +auto sub_group_load(const sycl::sub_group &sg, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + using ValueT = typename std::remove_cv_t; + ValueT x{}; + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_load(sg, m_ptr, x, striped); + return x; +#else + return sg.load(m_ptr); +#endif +} + +template +std::enable_if_t< + std::is_same_v, std::remove_cv_t>, + void> + sub_group_store(const sycl::sub_group &sg, + const sycl::vec &val, + sycl::multi_ptr m_ptr) +{ +#if 
(USE_GROUP_LOAD_STORE) + static_assert(std::is_same_v); + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_store(sg, val, m_ptr, striped); + return; +#else + sg.store(m_ptr, val); + return; +#endif +} + +template +std::enable_if_t< + std::is_same_v, std::remove_cv_t>, + void> + sub_group_store(const sycl::sub_group &sg, + const VecT &val, + sycl::multi_ptr m_ptr) +{ +#if (USE_GROUP_LOAD_STORE) + static constexpr auto striped = + ls_ns::properties{ls_ns::data_placement_striped}; + ls_ns::group_store(sg, val, m_ptr, striped); + return; +#else + sg.store(m_ptr, val); + return; +#endif +} +} // namespace dpctl::tensor::sycl_utils diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp new file mode 100644 index 000000000000..bead0da5093e --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/type_dispatch.hpp @@ -0,0 +1,135 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +/// +/// \file +/// This file defines class to implement dispatch tables for pair of types +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +#include "type_dispatch_building.hpp" + +namespace dpctl::tensor::type_dispatch +{ +struct usm_ndarray_types +{ + int typenum_to_lookup_id(int typenum) const + { + using typenum_t = ::dpctl::tensor::type_dispatch::typenum_t; + auto const &api = ::dpctl::detail::dpctl_capi::get(); + + if (typenum == api.UAR_DOUBLE_) { + return static_cast(typenum_t::DOUBLE); + } + else if (typenum == api.UAR_INT64_) { + return static_cast(typenum_t::INT64); + } + else if (typenum == api.UAR_INT32_) { + return static_cast(typenum_t::INT32); + } + else if (typenum == api.UAR_BOOL_) { + return static_cast(typenum_t::BOOL); + } + else if (typenum == api.UAR_CDOUBLE_) { + return static_cast(typenum_t::CDOUBLE); + } + else if (typenum == api.UAR_FLOAT_) { + return static_cast(typenum_t::FLOAT); + } + else if (typenum == api.UAR_INT16_) { + return static_cast(typenum_t::INT16); + } + else if (typenum == api.UAR_INT8_) { + return static_cast(typenum_t::INT8); + } + else if (typenum == api.UAR_UINT64_) { + return static_cast(typenum_t::UINT64); + } + else if (typenum == api.UAR_UINT32_) { + return static_cast(typenum_t::UINT32); + } + else if (typenum == api.UAR_UINT16_) { + return static_cast(typenum_t::UINT16); + } + else if (typenum == api.UAR_UINT8_) { + return static_cast(typenum_t::UINT8); + } + else if (typenum == api.UAR_CFLOAT_) { + return static_cast(typenum_t::CFLOAT); + } + else if (typenum == api.UAR_HALF_) { + return static_cast(typenum_t::HALF); + } + else if (typenum == api.UAR_INT_ || typenum == api.UAR_UINT_) { + switch (sizeof(int)) { + case sizeof(std::int32_t): + return ((typenum == api.UAR_INT_) + ? static_cast(typenum_t::INT32) + : static_cast(typenum_t::UINT32)); + case sizeof(std::int64_t): + return ((typenum == api.UAR_INT_) + ? static_cast(typenum_t::INT64) + : static_cast(typenum_t::UINT64)); + default: + throw_unrecognized_typenum_error(typenum); + } + } + else if (typenum == api.UAR_LONGLONG_ || + typenum == api.UAR_ULONGLONG_) { + switch (sizeof(long long)) { + case sizeof(std::int64_t): + return ((typenum == api.UAR_LONGLONG_) + ? static_cast(typenum_t::INT64) + : static_cast(typenum_t::UINT64)); + default: + throw_unrecognized_typenum_error(typenum); + } + } + else { + throw_unrecognized_typenum_error(typenum); + } + // return code signalling error, should never be reached + assert(false); + return -1; + } + +private: + void throw_unrecognized_typenum_error(int typenum) const + { + throw std::runtime_error("Unrecognized typenum " + + std::to_string(typenum) + " encountered."); + } +}; +} // namespace dpctl::tensor::type_dispatch diff --git a/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp new file mode 100644 index 000000000000..7170624b5bbe --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/type_dispatch_building.hpp @@ -0,0 +1,293 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines class to implement dispatch tables for pair of types +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::type_dispatch +{ +enum class typenum_t : int +{ + BOOL = 0, + INT8, // 1 + UINT8, + INT16, + UINT16, + INT32, // 5 + UINT32, + INT64, + UINT64, + HALF, + FLOAT, // 10 + DOUBLE, + CFLOAT, + CDOUBLE, // 13 +}; +inline constexpr int num_types = 14; // number of elements in typenum_t + +template typename factory, + int _num_types> +class DispatchTableBuilder +{ +private: + template + const std::vector row_per_dst_type() const + { + std::vector per_dstTy = { + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory>{}.get(), + factory>{}.get()}; + assert(per_dstTy.size() == _num_types); + return per_dstTy; + } + +public: + DispatchTableBuilder() = default; + ~DispatchTableBuilder() = default; + + void populate_dispatch_table(funcPtrT table[][_num_types]) const + { + const auto map_by_dst_type = {row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type>(), + row_per_dst_type>()}; + assert(map_by_dst_type.size() == _num_types); + int dst_id = 0; + for (const auto &row : map_by_dst_type) { + int src_id = 0; + for (const auto &fn_ptr : row) { + table[dst_id][src_id] = fn_ptr; + ++src_id; + } + ++dst_id; + } + } +}; + +template typename factory, + int _num_types> +class DispatchVectorBuilder +{ +private: + template + const funcPtrT func_per_type() const + { + 
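+        // each `factory` specialization is expected to return either a
+        // kernel function pointer for type `Ty`, or `nullptr` when the
+        // type is not supported (callers check the entry for nullptr)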
funcPtrT f = factory{}.get(); + return f; + } + +public: + DispatchVectorBuilder() = default; + ~DispatchVectorBuilder() = default; + + void populate_dispatch_vector(funcPtrT vector[]) const + { + const auto fn_map_by_type = {func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type>(), + func_per_type>()}; + assert(fn_map_by_type.size() == _num_types); + int ty_id = 0; + for (const auto &fn : fn_map_by_type) { + vector[ty_id] = fn; + ++ty_id; + } + } +}; + +/*! @brief struct to define result_type typename for Ty == ArgTy */ +template +struct TypeMapResultEntry : std::is_same +{ + using result_type = ResTy; +}; + +/*! @brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 == + * ArgTy2 */ +template +struct BinaryTypeMapResultEntry + : std::conjunction, std::is_same> +{ + using result_type = ResTy; +}; + +/*! @brief fall-through struct with specified result_type, usually void */ +template +struct DefaultResultEntry : std::true_type +{ + using result_type = Ty; +}; + +/*! @brief Utility struct to convert C++ type into typeid integer */ +template +struct GetTypeid +{ + int get() + { + if constexpr (std::is_same_v) { + return static_cast(typenum_t::BOOL); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::HALF); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::FLOAT); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::DOUBLE); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CFLOAT); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CDOUBLE); + } + else if constexpr (std::is_same_v) { // special token + return -1; + } + + assert(("Unsupported type T", false)); + return -2; + } +}; + +/*! @brief Class to generate vector of null function pointers */ +template +struct NullPtrVector +{ + + using value_type = FunPtrT; + using const_reference = value_type const &; + + NullPtrVector() : val(nullptr) {} + + const_reference operator[](int) const { return val; } + +private: + value_type val; +}; + +/*! 
@brief Class to generate table of null function pointers */ +template +struct NullPtrTable +{ + using value_type = NullPtrVector; + using const_reference = value_type const &; + + NullPtrTable() : val() {} + + const_reference operator[](int) const { return val; } + +private: + value_type val; +}; + +template +struct TypePairDefinedEntry + : std::conjunction, std::is_same> +{ + static constexpr bool is_defined = true; +}; + +struct NotDefinedEntry : std::true_type +{ + static constexpr bool is_defined = false; +}; +} // namespace dpctl::tensor::type_dispatch diff --git a/dpnp/tensor/libtensor/include/utils/type_utils.hpp b/dpnp/tensor/libtensor/include/utils/type_utils.hpp new file mode 100644 index 000000000000..47b1a5554815 --- /dev/null +++ b/dpnp/tensor/libtensor/include/utils/type_utils.hpp @@ -0,0 +1,163 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions for value casting. 
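+///
+/// Example (an illustrative sketch): `convert_impl` follows NumPy
+/// conversion semantics rather than a plain static_cast, e.g.
+///
+///     namespace tu = dpctl::tensor::type_utils;
+///     // complex -> bool: true when real or imaginary part is non-zero
+///     bool b = tu::convert_impl<bool, std::complex<float>>({0.0f, 2.0f});
+///     // complex -> real: keeps the real part
+///     float r = tu::convert_impl<float, std::complex<float>>({1.5f, -2.0f});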
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <complex>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::type_utils
+{
+template <typename T, typename Enable = void>
+struct is_complex : public std::false_type
+{
+};
+
+template <typename T>
+struct is_complex<
+    T,
+    std::enable_if_t<std::is_same_v<std::remove_cv_t<T>, std::complex<float>> ||
+                     std::is_same_v<std::remove_cv_t<T>, std::complex<double>>>>
+    : public std::true_type
+{
+};
+
+template <typename T>
+inline constexpr bool is_complex_v = is_complex<T>::value;
+
+template <typename dstTy, typename srcTy>
+dstTy convert_impl(const srcTy &v)
+{
+    if constexpr (std::is_same_v<dstTy, srcTy>) {
+        return v;
+    }
+    else if constexpr (std::is_same_v<dstTy, bool>) {
+        if constexpr (is_complex_v<srcTy>) {
+            // bool(complex_v) ==
+            //     (complex_v.real() != 0) || (complex_v.imag() != 0)
+            return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
+                    convert_impl<bool, typename srcTy::value_type>(v.imag()));
+        }
+        else {
+            return static_cast<dstTy>(v != srcTy{0});
+        }
+    }
+    else if constexpr (std::is_same_v<srcTy, bool>) {
+        // C++ interprets a byte of storage behind bool by only
+        // testing its least significant bit, leading to both
+        // 0x00 and 0x02 being interpreted as False, while 0x01 and 0xFF
+        // are interpreted as True. NumPy's interpretation of the underlying
+        // storage is different: any bit set is interpreted as True,
+        // no bits set as False, see gh-2121
+        const std::uint8_t &u = sycl::bit_cast<std::uint8_t>(v);
+        if constexpr (is_complex_v<dstTy>) {
+            return (u == 0) ? dstTy{} : dstTy{1, 0};
+        }
+        else {
+            return (u == 0) ? dstTy{} : dstTy{1};
+        }
+    }
+    else if constexpr (is_complex_v<srcTy> && !is_complex_v<dstTy>) {
+        // real_t(complex_v) == real_t(complex_v.real())
+        return convert_impl<dstTy, typename srcTy::value_type>(v.real());
+    }
+    else if constexpr (!std::is_integral_v<srcTy> &&
+                       !std::is_same_v<dstTy, bool> &&
+                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>)
+    {
+        // first cast to the signed variant, then cast to the unsigned one
+        using signedT = typename std::make_signed_t<dstTy>;
+        return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
+    }
+    else {
+        return static_cast<dstTy>(v);
+    }
+}
+
+template <typename Ty>
+void validate_type_for_device(const sycl::device &d)
+{
+    if constexpr (std::is_same_v<Ty, double>) {
+        if (!d.has(sycl::aspect::fp64)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'float64'");
+        }
+    }
+    else if constexpr (std::is_same_v<Ty, std::complex<double>>) {
+        if (!d.has(sycl::aspect::fp64)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'complex128'");
+        }
+    }
+    else if constexpr (std::is_same_v<Ty, sycl::half>) {
+        if (!d.has(sycl::aspect::fp16)) {
+            throw std::runtime_error("Device " +
+                                     d.get_info<sycl::info::device::name>() +
+                                     " does not support type 'float16'");
+        }
+    }
+}
+
+template <typename Ty>
+void validate_type_for_device(const sycl::queue &q)
+{
+    validate_type_for_device<Ty>(q.get_device());
+}
+
+template <typename Op, typename Vec, std::size_t... I>
+auto vec_cast_impl(const Vec &v, std::index_sequence<I...>)
+{
+    return Op{v[I]...};
+}
+
+template <typename dstT,
+          typename srcT,
+          std::size_t N,
+          typename Indices = std::make_index_sequence<N>>
+auto vec_cast(const sycl::vec<srcT, N> &s)
+{
+    if constexpr (std::is_same_v<srcT, dstT>) {
+        return s;
+    }
+    else {
+        return vec_cast_impl<sycl::vec<dstT, N>, sycl::vec<srcT, N>>(s,
+                                                                     Indices{});
+    }
+}
+} // namespace dpctl::tensor::type_utils
diff --git a/dpnp/tensor/libtensor/source/accumulators.cpp b/dpnp/tensor/libtensor/source/accumulators.cpp
new file mode 100644
index 000000000000..c6ab96418d47
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators.cpp
@@ -0,0 +1,407 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Computation of positions of masked elements + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt64; + 
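+    // the strided variants below mirror the contiguous ones above; both
+    // int32 and int64 cumsum dtypes are supported, and py_mask_positions
+    // selects the vector matching the dtype of the `cumsum` array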
td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); + + return; +} + +std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int32_t/int64_t only + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int32 or int64 data-type."); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + std::vector host_task_events; + + if (mask.is_c_contiguous()) { + auto fn = (use_i32) + ? mask_positions_contig_i32_dispatch_vector[mask_typeid] + : mask_positions_contig_i64_dispatch_vector[mask_typeid]; + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + + sycl::event::wait(host_task_events); + } + return total_set; + } + + const py::ssize_t *shape = mask.get_shape_raw(); + auto const &strides_vector = mask.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + compact_iteration_space(nd, shape, strides_vector, compact_shape, + compact_strides); + + // Strided implementation + auto strided_fn = + (use_i32) ? 
mask_positions_strided_i32_dispatch_vector[mask_typeid] + : mask_positions_strided_i64_dispatch_vector[mask_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, + cumsum_data, host_task_events, dependent_events); + + sycl::event::wait(host_task_events); + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + + return total_set; +} + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + cumsum_1d_strided_dispatch_vector[td_ns::num_types]; +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_vector[td_ns::num_types]; + +void populate_cumsum_1d_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); + + using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); + + return; +} + +std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + // cumsum.shape == (src.size,) + auto src_size = src.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != src_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + if (src_size == 0) { + return 0; + } + + int src_typenum = src.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // src can be any type + const char *src_data = src.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // this cumsum must be 
int64_t only + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + std::vector host_task_events; + + if (src.is_c_contiguous()) { + auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + std::size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; + } + + const py::ssize_t *shape = src.get_shape_raw(); + auto const &strides_vector = src.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int src_nd = src.get_ndim(); + int nd = src_nd; + + compact_iteration_space(nd, shape, strides_vector, compact_shape, + compact_strides); + + // Strided implementation + auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total = + strided_fn(exec_q, src_size, src_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); + + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + + return total; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators.hpp b/dpnp/tensor/libtensor/source/accumulators.hpp new file mode 100644 index 000000000000..e400aad2dceb --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern std::size_t + py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_cumsum_1d_dispatch_vectors(void); + +extern std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends = {}); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp new file mode 100644 index 000000000000..bce47c45f9b1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -0,0 +1,461 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
new file mode 100644
index 000000000000..bce47c45f9b1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
@@ -0,0 +1,461 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/accumulators.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event>
+    py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src,
+                            const int trailing_dims_to_accumulate,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            sycl::queue &exec_q,
+                            std::vector<sycl::event> const &depends,
+                            const strided_fnT &strided_dispatch_table,
+                            const contig_fnT &contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iter_nd = src_nd - trailing_dims_to_accumulate;
+    if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) {
+        throw py::value_error(
+            "trailing_dims_to_accumulate must be positive, but no "
+            "greater than rank of the input array");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t acc_nelems(1);
+    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
+        auto dst_shape_i = dst_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i);
+        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (acc_nelems == 0)) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, acc_nelems * iter_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    std::vector<sycl::event> host_task_events;
+
+    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
+        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+
+        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
+                                host_task_events, depends);
+
+        return std::make_pair(
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
+            acc_ev);
+    }
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    int acc_nd = trailing_dims_to_accumulate;
+
+    using shT = std::vector<py::ssize_t>;
+    shT acc_shape(std::begin(src_shape_vec) + iter_nd,
+                  std::end(src_shape_vec));
+
+    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
+                        std::end(src_strides_vec));
+
+    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
+                        std::end(dst_strides_vec));
+
+    shT iter_shape(std::begin(src_shape_vec),
+                   std::begin(src_shape_vec) + iter_nd);
+
+    shT iter_src_strides(std::begin(src_strides_vec),
+                         std::begin(src_strides_vec) + iter_nd);
+
+    shT iter_dst_strides(std::begin(dst_strides_vec),
+                         std::begin(dst_strides_vec) + iter_nd);
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    auto packed_shapes_and_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_and_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
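+    // Lifetime note: packed_shapes_and_strides is USM memory that the kernel
+    // may still be reading at this point, so it must not be freed
+    // synchronously here. The async_smart_free helper used below schedules
+    // the deallocation as a host task gated on acc_ev; conceptually
+    // (illustrative sketch only, not the actual implementation from
+    // utils/sycl_alloc_utils.hpp):
+    //
+    //   sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+    //       cgh.depends_on(acc_ev);
+    //       cgh.host_task([owner = std::move(usm_owner)]() mutable {
+    //           owner.reset(); // USM deleter runs after the kernel completes
+    //       });
+    //   });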
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_accumulate_final_axis_include_initial(
+    const dpctl::tensor::usm_ndarray &src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    std::vector<sycl::event> const &depends,
+    const strided_fnT &strided_dispatch_table,
+    const contig_fnT &contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+
+    static constexpr int acc_nd = 1;
+
+    int iter_nd = src_nd - acc_nd;
+    if (iter_nd < 0) {
+        throw py::value_error("accumulation axis must not be greater than rank "
+                              "of the input array");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t acc_nelems(1);
+    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
+        auto dst_shape_i = dst_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i);
+        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (acc_nelems == 0)) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, acc_nelems * iter_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    std::vector<sycl::event> host_task_events;
+
+    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
+        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+
+        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
+                                host_task_events, depends);
+
+        return std::make_pair(
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
+            acc_ev);
+    }
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT acc_shape(std::begin(src_shape_vec) + iter_nd,
+                  std::end(src_shape_vec));
+
+    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
+                        std::end(src_strides_vec));
+
+    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
+                        std::end(dst_strides_vec));
+
+    shT iter_shape(std::begin(src_shape_vec),
+                   std::begin(src_shape_vec) + iter_nd);
+
+    shT iter_src_strides(std::begin(src_strides_vec),
+                         std::begin(src_strides_vec) + iter_nd);
+
+    shT iter_dst_strides(std::begin(dst_strides_vec),
+                         std::begin(dst_strides_vec) + iter_nd);
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    // Strided implementation
+    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (strided_fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_iter_shape,
+        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
+        acc_src_strides, acc_dst_strides);
+    auto packed_shapes_and_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_and_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *acc_shapes_and_strides =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+
+    sycl::event acc_ev = strided_fn(
+        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
+        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
+        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
+        acc_ev);
+}
+
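+// Note on the include-initial variant: for an input of shape (..., n) the
+// destination has shape (..., n + 1); see the `src_shape_ptr[i] + 1 ==
+// dst_shape_i` check above. Under the assumed semantics, dst[..., 0] holds
+// the identity of the scan op and dst[..., k] the scan over the first k
+// input elements. Illustrative 1-D example for a cumulative sum:
+// src = {1, 2, 3} gives dst = {0, 1, 3, 6}.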
+/*! @brief Template implementing Python API for querying accumulation
+ *  type support */
+template <typename fnT>
+bool py_accumulate_dtype_supported(const py::dtype &input_dtype,
+                                   const py::dtype &output_dtype,
+                                   const fnT &dispatch_table)
+{
+    int arg_tn =
+        input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int out_tn =
+        output_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int arg_typeid = -1;
+    int out_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
+        out_typeid = array_types.typenum_to_lookup_id(out_tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
+        out_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("Reduction type support check: lookup failed");
+    }
+
+    // remove_all_extents gets underlying type of table
+    using fn_ptrT = typename std::remove_all_extents<fnT>::type;
+    fn_ptrT fn = nullptr;
+
+    fn = dispatch_table[arg_typeid][out_typeid];
+
+    return (fn != nullptr);
+}
+
+} // namespace dpctl::tensor::py_internal
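Each cumulative operation later in this diff instantiates py_accumulate_dtype_supported with its own strided dispatch table (the `_cum*_dtype_supported` bindings). A minimal sketch of the intended call shape; `some_op_strided_dispatch_table` is a hypothetical stand-in for e.g. impl::cumsum_strided_dispatch_table defined further below:

    // Hypothetical usage sketch (requires pybind11/numpy.h for py::dtype::of).
    bool supported = dpctl::tensor::py_internal::py_accumulate_dtype_supported(
        py::dtype::of<std::int32_t>(), py::dtype::of<std::int64_t>(),
        some_op_strided_dispatch_table);
    // true iff a kernel instantiation exists for the (int32 -> int64) pair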
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp
new file mode 100644
index 000000000000..5e07e81b7ad5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.cpp
@@ -0,0 +1,55 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "cumulative_logsumexp.hpp"
+#include "cumulative_prod.hpp"
+#include "cumulative_sum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+/*! @brief Add accumulators to Python module */
+void init_accumulator_functions(py::module_ m)
+{
+    init_cumulative_logsumexp(m);
+    init_cumulative_prod(m);
+    init_cumulative_sum(m);
+}
+
+} // namespace dpctl::tensor::py_internal
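init_accumulator_functions is the single hook the extension module needs to register all three cumulative operations. A sketch of the module entry point that would call it; the module name is an assumption, as the actual PYBIND11_MODULE definition is not part of this hunk:

    // Hypothetical module definition (illustrative only).
    PYBIND11_MODULE(_tensor_accumulation_impl, m)
    {
        dpctl::tensor::py_internal::init_accumulator_functions(m);
    }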
diff --git a/dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp
new file mode 100644
index 000000000000..c33a040a7fa7
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/accumulators_common.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_accumulator_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
new file mode 100644
index 000000000000..d4961c9edbf1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
@@ -0,0 +1,343 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumLogSumExp1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExp1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr 
(TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumlogsumexp_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumlogsumexp_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumlogsumexp_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumlogsumexp_dispatch_tables; + populate_cumlogsumexp_dispatch_tables(); + + using impl::cumlogsumexp_1d_contig_dispatch_table; + using impl::cumlogsumexp_strided_dispatch_table; + auto cumlogsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, + exec_q, depends, + cumlogsumexp_strided_dispatch_table, + cumlogsumexp_1d_contig_dispatch_table); + }; + m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table; + using 
impl::cumlogsumexp_include_initial_strided_dispatch_table; + auto cumlogsumexp_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumlogsumexp_include_initial_strided_dispatch_table, + cumlogsumexp_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumlogsumexp_final_axis_include_initial", + cumlogsumexp_include_initial_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_accumulate_dtype_supported( + input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); + }; + m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp new file mode 100644 index 000000000000..f1292320bd0d --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_logsumexp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
new file mode 100644
index 000000000000..319709b30a76
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.cpp
@@ -0,0 +1,352 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumprod_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForProdAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumProdScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + +template +struct CumProd1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProd1DIncludeInitialContigFactory +{ + fnT get() + { 
+ if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumprod_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumprod_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumprod_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumprod_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumprod_dispatch_tables; + populate_cumprod_dispatch_tables(); + + using impl::cumprod_1d_contig_dispatch_table; + using impl::cumprod_strided_dispatch_table; + auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); + }; + m.def("_cumprod_over_axis", cumprod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumprod_1d_include_initial_contig_dispatch_table; + using impl::cumprod_include_initial_strided_dispatch_table; + auto cumprod_include_initial_pyapi = + 
[&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumprod_include_initial_strided_dispatch_table, + cumprod_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumprod_strided_dispatch_table); + }; + m.def("_cumprod_dtype_supported", cumprod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp new file mode 100644 index 000000000000..e14bb2c44361 --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_prod(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
new file mode 100644
index 000000000000..f700883af2a1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.cpp
@@ -0,0 +1,350 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumsum_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForSumAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumSumScanOpT = std:: + conditional_t, sycl::logical_or, sycl::plus>; + +template +struct CumSum1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSum1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr 
(TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation< + srcTy, dstTy>::is_defined) { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumsum_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumsum_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumsum_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumsum_dispatch_tables; + populate_cumsum_dispatch_tables(); + + using impl::cumsum_1d_contig_dispatch_table; + using impl::cumsum_strided_dispatch_table; + auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); + }; + m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumsum_1d_include_initial_contig_dispatch_table; + using impl::cumsum_include_initial_strided_dispatch_table; + auto cumsum_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, 
sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumsum_include_initial_strided_dispatch_table, + cumsum_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumsum_strided_dispatch_table); + }; + m.def("_cumsum_dtype_supported", cumsum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp new file mode 100644 index 000000000000..5e06b222a3bc --- /dev/null +++ b/dpnp/tensor/libtensor/source/accumulators/cumulative_sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+// extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cumulative_sum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
new file mode 100644
index 000000000000..146be45e4858
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -0,0 +1,853 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Masked extraction + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_contig_impl_fn_ptr_t; + +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i64_impl_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt32, td_ns::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt64, td_ns::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt32, td_ns::num_types> + dvb3; + dvb3.populate_dispatch_vector( + masked_extract_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt64, td_ns::num_types> + dvb4; + dvb4.populate_dispatch_vector( + 
        masked_extract_some_slices_strided_i64_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesContigFactoryForInt32;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_all_slices_contig_impl_fn_ptr_t,
+        MaskExtractAllSlicesContigFactoryForInt32, td_ns::num_types>
+        dvb5;
+    dvb5.populate_dispatch_vector(
+        masked_extract_all_slices_contig_i32_impl_dispatch_vector);
+
+    using dpctl::tensor::kernels::indexing::
+        MaskExtractAllSlicesContigFactoryForInt64;
+    td_ns::DispatchVectorBuilder<
+        masked_extract_all_slices_contig_impl_fn_ptr_t,
+        MaskExtractAllSlicesContigFactoryForInt64, td_ns::num_types>
+        dvb6;
+    dvb6.populate_dispatch_vector(
+        masked_extract_all_slices_contig_i64_impl_dispatch_vector);
+}
+
+std::pair<sycl::event, sycl::event>
+    py_extract(const dpctl::tensor::usm_ndarray &src,
+               const dpctl::tensor::usm_ndarray &cumsum,
+               int axis_start, // axis_start <= mask_i < axis_end
+               int axis_end,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int src_nd = src.get_ndim();
+    if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) {
+        throw py::value_error("Specified axes_start and axes_end are invalid.");
+    }
+    int mask_span_sz = axis_end - axis_start;
+
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd + (mask_span_sz - 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) {
+        throw py::value_error("cumsum array must be a C-contiguous vector");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    py::ssize_t cumsum_sz = cumsum.get_size();
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool same_ortho_dims(true);
+    std::size_t ortho_nelems(1); // number of orthogonal iterations
+
+    for (auto i = 0; i < axis_start; ++i) {
+        auto src_sh_i = src_shape[i];
+        ortho_nelems *= src_sh_i;
+        same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis_end; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        ortho_nelems *= src_sh_i;
+        same_ortho_dims =
+            same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]);
+    }
+
+    std::size_t masked_src_nelems(1);
+    std::size_t masked_dst_nelems(dst_shape[axis_start]);
+    for (auto i = axis_start; i < axis_end; ++i) {
+        masked_src_nelems *= src_shape[i];
+    }
+
+    // masked_dst_nelems is number of set elements in the mask, or last element
+    // in cumsum
+    if (!same_ortho_dims ||
+        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz))) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, ortho_nelems * masked_dst_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src, nor with cumsum.
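+    // Illustration of the contract validated above (hypothetical values, not
+    // taken from any caller): for src of shape (2, 3) masked over both axes
+    // (axis_start=0, axis_end=2) with mask [[1, 0, 1], [0, 0, 1]], cumsum is
+    // the C-order inclusive scan [1, 1, 2, 2, 2, 3]. Then masked_src_nelems
+    // == 6 == cumsum_sz, and dst has shape (3,): the last cumsum entry is
+    // the number of selected elements per orthogonal slice.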
+ if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error("Unexpected data type of cumsum array, expecting " + "'int32' or 'int64'"); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src.is_c_contiguous()) { + auto fn = + (use_i32) + ? masked_extract_all_slices_contig_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_contig_i64_impl_dispatch_vector + [src_typeid]; + + extract_ev = + fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, dst_data_p, + dst_shape_vec[0], dst_strides_vec[0], depends); + + // + host_task_events.push_back(extract_ev); + } + else { + // empty orthogonal directions + auto fn = + (use_i32) + ? masked_extract_all_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + } + else { + // non-empty orthogonal directions + auto fn = + (use_i32) + ? 
masked_extract_some_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_some_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; + shT masked_src_strides; + split_iteration_space(src_shape_vec, src_strides_vec, axis_start, + axis_end, ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, + axis_start + 1, ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + simplify_iteration_space( + ortho_nd, _shape, ortho_src_strides, ortho_dst_strides, + // output + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides, + masked_src_shape, masked_src_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *packed_ortho_src_dst_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_src_shape_strides = + packed_shapes_strides + (3 * ortho_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using 
dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector( + masked_place_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector( + masked_place_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector( + masked_place_some_slices_strided_i64_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? cumsum[i-1] + 1 : 1) + */ +std::pair + py_place(const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + std::size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + std::size_t masked_dst_nelems(1); + for (auto i = axis_start; i < 
axis_end; ++i) {
+        masked_dst_nelems *= dst_shape[i];
+    }
+
+    if (!same_ortho_dims ||
+        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz))) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, ortho_nelems * masked_dst_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with rhs, nor with cumsum.
+    if (overlap(dst, rhs) || overlap(dst, cumsum)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int dst_typenum = dst.get_typenum();
+    int rhs_typenum = rhs.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    static constexpr int int64_typeid =
+        static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
+        throw py::value_error("Unexpected data type of cumsum array, expecting "
+                              "'int32' or 'int64'");
+    }
+
+    const bool use_i32 = (cumsum_typeid == int32_typeid);
+
+    if (dst_typeid != rhs_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data types");
+    }
+
+    char *dst_data_p = dst.get_data();
+    char *rhs_data_p = rhs.get_data();
+    char *cumsum_data_p = cumsum.get_data();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    auto rhs_shape_vec = rhs.get_shape_vector();
+    auto rhs_strides_vec = rhs.get_strides_vector();
+
+    sycl::event place_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis_start == 0 && axis_end == dst_nd) {
+        // empty orthogonal directions
+        auto fn = (use_i32)
+                      ? masked_place_all_slices_strided_i32_impl_dispatch_vector
+                            [dst_typeid]
+                      : masked_place_all_slices_strided_i64_impl_dispatch_vector
+                            [dst_typeid];
+
+        assert(rhs_shape_vec.size() == 1);
+        assert(rhs_strides_vec.size() == 1);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, dst_shape_vec, dst_strides_vec);
+        auto packed_dst_shape_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_dst_shape_strides_ev =
+            std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_dst_shape_strides =
+            packed_dst_shape_strides_owner.get();
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_dst_shape_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        place_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, rhs_data_p,
+                      dst_nd, packed_dst_shape_strides, rhs_shape_vec[0],
+                      rhs_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {place_ev}, packed_dst_shape_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+    else {
+        // non-empty orthogonal directions
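+        // For orientation (illustrative shapes, not from a caller):
+        // split_iteration_space below separates the masked axes from the
+        // orthogonal ones, e.g. a dst of shape (4, 5, 6) with axis_start=1,
+        // axis_end=3 splits into an orthogonal part (4,) iterated
+        // ortho_nelems times and a masked part (5, 6) addressed via cumsum.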
+        auto fn =
+            (use_i32)
+                ? masked_place_some_slices_strided_i32_impl_dispatch_vector
+                      [dst_typeid]
+                : masked_place_some_slices_strided_i64_impl_dispatch_vector
+                      [dst_typeid];
+
+        int masked_dst_nd = mask_span_sz;
+        int ortho_nd = dst_nd - masked_dst_nd;
+
+        using shT = std::vector<py::ssize_t>;
+
+        shT ortho_dst_shape;
+        shT masked_dst_shape;
+        shT ortho_dst_strides;
+        shT masked_dst_strides;
+        split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start,
+                              axis_end, ortho_dst_shape,
+                              masked_dst_shape, // 4 vectors modified
+                              ortho_dst_strides, masked_dst_strides);
+
+        shT ortho_rhs_shape;
+        shT masked_rhs_shape;
+        shT ortho_rhs_strides;
+        shT masked_rhs_strides;
+        split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start,
+                              axis_start + 1, ortho_rhs_shape,
+                              masked_rhs_shape, // 4 vectors modified
+                              ortho_rhs_strides, masked_rhs_strides);
+
+        assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(ortho_rhs_shape.size() == static_cast<std::size_t>(ortho_nd));
+        assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(),
+                          ortho_rhs_shape.begin()));
+
+        std::vector<py::ssize_t> simplified_ortho_shape;
+        std::vector<py::ssize_t> simplified_ortho_dst_strides;
+        std::vector<py::ssize_t> simplified_ortho_rhs_strides;
+
+        const py::ssize_t *_shape = ortho_dst_shape.data();
+
+        py::ssize_t ortho_dst_offset(0);
+        py::ssize_t ortho_rhs_offset(0);
+
+        simplify_iteration_space(
+            ortho_nd, _shape, ortho_dst_strides, ortho_rhs_strides,
+            simplified_ortho_shape, simplified_ortho_dst_strides,
+            simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset);
+
+        assert(masked_rhs_shape.size() == 1);
+        assert(masked_rhs_strides.size() == 1);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_ortho_shape,
+            simplified_ortho_dst_strides, simplified_ortho_rhs_strides,
+            masked_dst_shape, masked_dst_strides);
+        auto packed_shapes_strides_owner =
+            std::move(std::get<0>(ptr_size_event_tuple1));
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+        const py::ssize_t *packed_shapes_strides =
+            packed_shapes_strides_owner.get();
+
+        const py::ssize_t *packed_ortho_dst_rhs_shape_strides =
+            packed_shapes_strides;
+        const py::ssize_t *packed_masked_dst_shape_strides =
+            packed_shapes_strides + (3 * ortho_nd);
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        place_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p,
+                      cumsum_data_p, rhs_data_p,
+                      // data to build orthog_dst_rhs_indexer
+                      ortho_nd, packed_ortho_dst_rhs_shape_strides,
+                      ortho_dst_offset, ortho_rhs_offset,
+                      // data to build masked_dst_indexer
+                      masked_dst_nd, packed_masked_dst_shape_strides,
+                      // data to build masked_rhs_indexer
+                      masked_rhs_shape[0], masked_rhs_strides[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            dpctl::tensor::alloc_utils::async_smart_free(
+                exec_q, {place_ev}, packed_shapes_strides_owner);
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {dst, cumsum, rhs}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, place_ev);
+}
+
+// Non-zero
+
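+// Sketch of the contract (hypothetical 2x2 mask [[0, 1], [1, 0]]): cumsum of
+// the flattened mask is [0, 1, 2, 2]; the positions where it increments are
+// unraveled over mask_shape, so indexes becomes [[0, 1], [1, 0]] and
+// indexes[:, k] holds the coordinates of the k-th nonzero element.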
+std::pair<sycl::event, sycl::event>
+    py_nonzero(const dpctl::tensor::usm_ndarray
+                   &cumsum, // int32/int64 input array, 1D, C-contiguous
+               const dpctl::tensor::usm_ndarray
+                   &indexes, // int32/int64 2D output array, C-contiguous
+               const std::vector<py::ssize_t>
+                   &mask_shape, // shape of array from which cumsum was computed
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends)
+{
+    if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(indexes);
+
+    int cumsum_nd = cumsum.get_ndim();
+    if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) {
+        throw py::value_error("Cumsum array must be a C-contiguous vector");
+    }
+
+    int indexes_nd = indexes.get_ndim();
+    if (indexes_nd != 2 || !indexes.is_c_contiguous()) {
+        throw py::value_error("Index array must be a C-contiguous matrix");
+    }
+
+    std::size_t _ndim = mask_shape.size();
+    if (_ndim > std::numeric_limits<int>::max()) {
+        throw py::value_error("Shape is too large");
+    }
+    int ndim = static_cast<int>(_ndim);
+
+    const py::ssize_t *indexes_shape = indexes.get_shape_raw();
+
+    if (ndim != indexes_shape[0]) {
+        throw py::value_error(
+            "Length of shape must equal width of index matrix");
+    }
+
+    auto cumsum_sz = cumsum.get_size();
+    py::ssize_t shape_nelems =
+        std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1),
+                        std::multiplies<py::ssize_t>());
+
+    if (cumsum_sz != shape_nelems) {
+        throw py::value_error("Shape and cumsum size are not consistent");
+    }
+
+    py::ssize_t nz_elems = indexes_shape[1];
+
+    int indexes_typenum = indexes.get_typenum();
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum);
+
+    int cumsum_typenum = cumsum.get_typenum();
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    constexpr int int32_typeid = static_cast<int>(td_ns::typenum_t::INT32);
+    constexpr int int64_typeid = static_cast<int>(td_ns::typenum_t::INT64);
+
+    // cumsum must be int32_t or int64_t only
+    if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) ||
+        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid)) {
+        throw py::value_error("Cumulative sum array and index array must have "
+                              "int32 or int64 data-type");
+    }
+
+    if (cumsum_sz == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(cumsum, indexes)) {
+        throw py::value_error("Arrays are expected to have no memory overlap");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        indexes, nz_elems * _ndim);
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto mask_shape_copying_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, mask_shape);
+    auto src_shape_device_owner =
+        std::move(std::get<0>(mask_shape_copying_tuple));
+    sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple);
+    const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_ev);
+
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_fn_ptr_t;
+    using dpctl::tensor::kernels::indexing::non_zero_indexes_impl;
+
+    int fn_index = ((cumsum_typeid == int64_typeid) ? 1 : 0) +
+                   ((indexes_typeid == int64_typeid) ? 2 : 0);
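+    // fn_index is a two-bit code over the (cumsum, indexes) type pair: bit 0
+    // is set for an int64 cumsum, bit 1 for int64 indexes, selecting one of
+    // the four kernel instantiations below.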
+    std::array<non_zero_indexes_fn_ptr_t, 4> fn_impls = {
+        non_zero_indexes_impl<std::int32_t, std::int32_t>,
+        non_zero_indexes_impl<std::int64_t, std::int32_t>,
+        non_zero_indexes_impl<std::int32_t, std::int64_t>,
+        non_zero_indexes_impl<std::int64_t, std::int64_t>};
+    auto fn = fn_impls[fn_index];
+
+    sycl::event non_zero_indexes_ev =
+        fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(),
+           indexes.get_data(), src_shape_device_ptr, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {non_zero_indexes_ev}, src_shape_device_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {cumsum, indexes}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, non_zero_indexes_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp
new file mode 100644
index 000000000000..71eafc77b00c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/boolean_advanced_indexing.hpp
@@ -0,0 +1,81 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    py_extract(const dpctl::tensor::usm_ndarray &src,
+               const dpctl::tensor::usm_ndarray &cumsum,
+               int axis_start, // axis_start <= mask_i < axis_end
+               int axis_end,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends = {});
+
+extern void populate_masked_extract_dispatch_vectors(void);
+
+extern std::pair<sycl::event, sycl::event>
+    py_place(const dpctl::tensor::usm_ndarray &dst,
+             const dpctl::tensor::usm_ndarray &cumsum,
+             int axis_start, // axis_start <= mask_i < axis_end
+             int axis_end,
+             const dpctl::tensor::usm_ndarray &rhs,
+             sycl::queue &exec_q,
+             const std::vector<sycl::event> &depends = {});
+
+extern void populate_masked_place_dispatch_vectors(void);
+
+extern std::pair<sycl::event, sycl::event>
+    py_nonzero(const dpctl::tensor::usm_ndarray
+                   &cumsum, // int32/int64 input array, 1D, C-contiguous
+               const dpctl::tensor::usm_ndarray
+                   &indexes, // int32/int64 2D output array, C-contiguous
+               const std::vector<py::ssize_t>
+                   &mask_shape, // shape of array from which cumsum was computed
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends = {});
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/clip.cpp b/dpnp/tensor/libtensor/source/clip.cpp
new file mode 100644
index 000000000000..4a0e5b9357de
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/clip.cpp
@@ -0,0 +1,263 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.clip
+//===---------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "dpnp4pybind11.hpp"
+#include
+
+#include "clip.hpp"
+#include "kernels/clip.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t;
+using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t;
+
+static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types];
+static clip_strided_impl_fn_ptr_t
+    clip_strided_dispatch_vector[td_ns::num_types];
+
+void init_clip_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::clip::ClipContigFactory;
+    DispatchVectorBuilder<clip_contig_impl_fn_ptr_t, ClipContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(clip_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::clip::ClipStridedFactory;
+    DispatchVectorBuilder<clip_strided_impl_fn_ptr_t, ClipStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(clip_strided_dispatch_vector);
+}
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event>
+    py_clip(const dpctl::tensor::usm_ndarray &src,
+            const dpctl::tensor::usm_ndarray &min,
+            const dpctl::tensor::usm_ndarray &max,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends)
+{
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    int nd = src.get_ndim();
+    int min_nd = min.get_ndim();
+    int max_nd = max.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (nd != min_nd || nd != max_nd) {
+        throw py::value_error(
+            "Input arrays are not of appropriate dimension for clip kernel.");
+    }
+
+    if (nd != dst_nd) {
+        throw py::value_error(
+            "Destination is not of appropriate dimension for clip kernel.");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *min_shape = min.get_shape_raw();
+    const py::ssize_t *max_shape = max.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t nelems(1);
+    for (int i = 0; i < nd; ++i) {
+        const auto &sh_i = dst_shape[i];
+        nelems *= static_cast<std::size_t>(sh_i);
+        shapes_equal = shapes_equal && (min_shape[i] == sh_i) &&
+                       (max_shape[i] == sh_i) && (src_shape[i] == sh_i);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Arrays are not of matching shapes.");
+    }
+
+    if (nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
+        (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
+        (overlap(dst, max) && !same_logical_tensors(dst, max))) {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
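+    // Semantics note (informal): the kernels dispatched below compute, per
+    // element, dst[i] = src[i] clamped to the interval [min[i], max[i]],
+    // roughly std::min(std::max(src[i], min[i]), max[i]), up to the kernel's
+    // own handling of NaNs.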
+    int min_typenum = min.get_typenum();
+    int max_typenum = max.get_typenum();
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int min_typeid = array_types.typenum_to_lookup_id(min_typenum);
+    int max_typeid = array_types.typenum_to_lookup_id(max_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid || src_typeid != min_typeid ||
+        src_typeid != max_typeid) {
+        throw py::value_error("Input, min, max, and destination arrays must "
+                              "have the same data type");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems);
+
+    char *src_data = src.get_data();
+    char *min_data = min.get_data();
+    char *max_data = max.get_data();
+    char *dst_data = dst.get_data();
+
+    bool is_min_c_contig = min.is_c_contiguous();
+    bool is_min_f_contig = min.is_f_contiguous();
+
+    bool is_max_c_contig = max.is_c_contiguous();
+    bool is_max_f_contig = max.is_f_contiguous();
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig = (is_min_c_contig && is_max_c_contig &&
+                         is_src_c_contig && is_dst_c_contig);
+    bool all_f_contig = (is_min_f_contig && is_max_f_contig &&
+                         is_src_f_contig && is_dst_f_contig);
+
+    if (all_c_contig || all_f_contig) {
+        auto fn = clip_contig_dispatch_vector[src_typeid];
+
+        sycl::event clip_ev =
+            fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends);
+        sycl::event ht_ev =
+            keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev});
+
+        return std::make_pair(ht_ev, clip_ev);
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &min_strides = min.get_strides_vector();
+    auto const &max_strides = max.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_min_strides;
+    shT simplified_max_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t min_offset(0);
+    py::ssize_t max_offset(0);
+    py::ssize_t dst_offset(0);
+
+    simplify_iteration_space_4(
+        nd, src_shape, src_strides, min_strides, max_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides, src_offset, min_offset,
+        max_offset, dst_offset);
+
+    auto fn = clip_strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // common shape and strides
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data,
+                             dst_data, packed_shape_strides,
src_offset, + min_offset, max_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {clip_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, min, max, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, clip_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/clip.hpp b/dpnp/tensor/libtensor/source/clip.hpp new file mode 100644 index 000000000000..de8f0e559b6e --- /dev/null +++ b/dpnp/tensor/libtensor/source/clip.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +extern void init_clip_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp new file mode 100644 index 000000000000..7c2db989b0c2 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -0,0 +1,296 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_contig_fn_ptr_t + copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && (i < src_nd); ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. 
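+    // Overview: from here the copy lowers to one of several paths: a single
+    // memcpy when src and dst share both data type and contiguity, the
+    // py_as_c_contig/py_as_f_contig specializations for same-type copies
+    // into a contiguous destination, 1d/0d special cases, or the generic
+    // strided copy-and-cast kernel driven by packed shape/strides metadata.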
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + + sycl::event copy_ev; + if (src_type_id == dst_type_id) { + + int src_elem_size = src.get_elemsize(); + + copy_ev = exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + } + else { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id]; + copy_ev = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (nd < 2) { + if (nd == 1) { + std::array shape_arr = {simplified_shape[0]}; + std::array src_strides_arr = { + simplified_src_strides[0]}; + std::array dst_strides_arr = { + simplified_dst_strides[0]}; + + sycl::event copy_and_cast_1d_event; + if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) && + (src_offset == 0) && (dst_offset == 0)) { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id] + [src_type_id]; + copy_and_cast_1d_event = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + else { + auto fn = + copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + copy_and_cast_1d_event = + fn(exec_q, src_nelems, shape_arr, src_strides_arr, + dst_strides_arr, src_data, src_offset, dst_data, + dst_offset, depends); + } + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto copy_and_cast_fn = + 
        copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    const sycl::event &copy_and_cast_generic_ev = copy_and_cast_fn(
+        exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
+        dst_offset, depends, {copy_shape_ev});
+
+    // async free of shape_strides temporary
+    const auto &temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_and_cast_generic_ev}, shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_and_cast_generic_ev);
+}
+
+void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
+{
+    using namespace td_ns;
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory;
+    DispatchTableBuilder<copy_and_cast_contig_fn_ptr_t,
+                         CopyAndCastContigFactory, num_types>
+        dtb_contig;
+    dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory;
+    DispatchTableBuilder<copy_and_cast_generic_fn_ptr_t,
+                         CopyAndCastGenericFactory, num_types>
+        dtb_generic;
+    dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory;
+    DispatchTableBuilder<copy_and_cast_1d_fn_ptr_t, CopyAndCast1DFactory,
+                         num_types>
+        dtb_1d;
+    dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
new file mode 100644
index 000000000000..d2e07b08d38f
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
@@ -0,0 +1,53 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_as_contig.cpp b/dpnp/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 000000000000..c1c4b740dfba --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,782 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template +std::size_t get_nelems(const std::vector &shape) +{ + auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + static constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_c_contig(const 
dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, src_shape_vec.data(), src_strides_vec, + dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event 
ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. 
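+    // Illustrative note (assumption, not part of the kernel contract as
+    // stated in this patch): src and dst may be views into one USM
+    // allocation, so a hypothetical Python-level call such as
+    //
+    //     x = dpnp.ones((4, 4))
+    //     # src = x[:, :-1] and dst = x[:, 1:] alias the same buffer, so
+    //     # work-items could read elements another work-item just wrote.
+    //
+    // would race; MemoryOverlap conservatively flags such pairs before the
+    // copy is submitted.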
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, src_shape_vec.data(), src_strides_vec, + dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + 
sycl::queue &exec_q,
+                       const std::vector<sycl::event> &depends)
+{
+    /* Same dimensions, same shape, same data-type
+     * dst is F-contiguous.
+     */
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (src_nd != dst_nd) {
+        throw py::value_error("Number of dimensions must be the same.");
+    }
+    if (src_nd < 2) {
+        throw py::value_error("Arrays must have 2 or more axes");
+    }
+
+    // ensures also that destination is ample enough to accommodate all
+    // elements of src array
+    if (!dst.is_f_contiguous()) {
+        throw py::value_error("Destination array must be F-contiguous");
+    }
+
+    const auto &src_shape_vec = src.get_shape_vector();
+    const auto &dst_shape_vec = dst.get_shape_vector();
+
+    std::size_t nelems{1};
+    bool equal_shapes = true;
+
+    for (int i = 0; equal_shapes && (i < src_nd); ++i) {
+        auto sh_i = src_shape_vec[i];
+        equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]);
+        nelems *= static_cast<std::size_t>(sh_i);
+    }
+
+    if (!equal_shapes) {
+        throw py::value_error("Shapes must be equal");
+    }
+
+    const auto n = dst_shape_vec.front();
+    if (dst_shape_vec[1] != n) {
+        throw py::value_error("Matrices must be square");
+    }
+
+    const auto &src_strides_vec = src.get_strides_vector();
+
+    if (src_strides_vec[1] != py::ssize_t(1)) {
+        throw py::value_error("Unexpected source array layout");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_type_id != dst_type_id) {
+        throw py::value_error(
+            "Source and destination arrays must have the same data type");
+    }
+
+    if (nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    const auto &dst_strides_vec = dst.get_strides_vector();
+
+    const std::size_t batch_nelems =
+        (src_nd == 2) ? std::size_t(1) : (nelems / (n * n));
+    const py::ssize_t dst_batch_step =
+        (src_nd == 2) ?
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + simplify_iteration_space(nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +} // namespace 
dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_as_contig.hpp b/dpnp/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 000000000000..bfe3159c8813 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl::tensor::py_internal +{ + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_reshape.cpp b/dpnp/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 000000000000..524bfcfdb98b --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ *
+ * Equivalent to the following loop:
+ *
+ *     for i in range(src.size):
+ *         dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)]
+ */
+std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    py::ssize_t src_nelems = src.get_size();
+    py::ssize_t dst_nelems = dst.get_size();
+
+    // Must have the same number of elements
+    if (src_nelems != dst_nelems) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_reshape requires src and dst to "
+            "have the same number of elements.");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    // type numbers must be the same
+    if (src_typenum != dst_typenum) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_reshape requires src and dst to "
+            "have the same type.");
+    }
+
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check same contexts
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if (src_nelems == 1) {
+        // handle special case of 1-element array
+        int src_elemsize = src.get_elemsize();
+        const char *src_data = src.get_data();
+        char *dst_data = dst.get_data();
+        sycl::event copy_ev =
+            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    // dimensions may be different
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int type_id = array_types.typenum_to_lookup_id(src_typenum);
+
+    auto fn = copy_for_reshape_generic_dispatch_vector[type_id];
+
+    auto src_shape = src.get_shape_vector();
+    auto src_strides = src.get_strides_vector();
+
+    auto dst_shape = dst.get_shape_vector();
+    auto dst_strides = dst.get_strides_vector();
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_shape, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, src_shape, src_strides, dst_shape,
+        dst_strides);
+    auto copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_reshape_event =
+        fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data,
+           dst_data, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_for_reshape_event}, shape_strides_owner);
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_reshape_event);
+}
+
+void init_copy_for_reshape_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory;
+
+    DispatchVectorBuilder<copy_for_reshape_fn_ptr_t,
+                          CopyForReshapeGenericFactory, num_types>
+        dvb;
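+    // Sketch of how the populated vector is consumed (names taken from the
+    // copy entry point earlier in this file, shown here for orientation):
+    //
+    //     auto fn = copy_for_reshape_generic_dispatch_vector[type_id];
+    //     sycl::event ev = fn(exec_q, src_nelems, src_nd, dst_nd,
+    //                         shape_strides, src_data, dst_data, all_deps);
+    //
+    // The builder instantiates CopyForReshapeGenericFactory once per
+    // supported type id, so runtime dispatch is a single indexed load.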
dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_reshape.hpp b/dpnp/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 000000000000..c5af885ad6cd --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_roll.cpp b/dpnp/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 000000000000..7742c1c96a4e --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,399 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + std::size_t offset = + (shift > 0) ? 
(shift % src_nelems) : (src_nelems + (shift % src_nelems)) % src_nelems;
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    if (both_c_contig || both_f_contig) {
+        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
+
+        if (fn != nullptr) {
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event copy_for_roll_ev =
+                fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data,
+                   zero_offset, depends);
+
+            sycl::event ht_ev =
+                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
+
+            return std::make_pair(ht_ev, copy_for_roll_ev);
+        }
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd;
+    const py::ssize_t *shape = src_shape_ptr;
+
+    // nd, simplified_* and *_offset are modified by reference
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (nd == 1 && simplified_src_strides[0] == 1 &&
+        simplified_dst_strides[0] == 1)
+    {
+        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
+
+        if (fn != nullptr) {
+
+            sycl::event copy_for_roll_ev =
+                fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data,
+                   dst_offset, depends);
+
+            sycl::event ht_ev =
+                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
+
+            return std::make_pair(ht_ev, copy_for_roll_ev);
+        }
+    }
+
+    auto fn = copy_for_roll_strided_dispatch_vector[type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_roll_event =
+        fn(exec_q, offset, src_nelems, nd, shape_strides, src_data,
+           src_offset, dst_data, dst_offset, all_deps);
+
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {copy_for_roll_event}, shape_strides_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_roll_event);
+}
+
+std::pair<sycl::event, sycl::event>
+    copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 const std::vector<py::ssize_t> &shifts,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    // Must have the same number of dimensions
+    if (src_nd != dst_nd) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same number of dimensions.");
+    }
+
+    if (static_cast<std::size_t>(src_nd) != shifts.size()) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires shifts to "
+            "contain an integral shift for each array dimension.");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const
py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + std::size_t offset = + (shifts[i] >= 0) ? (shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + static constexpr py::ssize_t src_offset = 0; + static constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + auto shape_strides_shifts_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_shifts_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + 
dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_for_roll.hpp b/dpnp/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 000000000000..cffbf9f6f0d6 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,65 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 000000000000..e97e8aeb1ca1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,368 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "kernels/copy_and_cast.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
+#include "simplify_iteration_space.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::tensor::kernels::copy_and_cast::
+    copy_and_cast_from_host_blocking_fn_ptr_t;
+
+static copy_and_cast_from_host_blocking_fn_ptr_t
+    copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types]
+                                                   [td_ns::num_types];
+
+using dpctl::tensor::kernels::copy_and_cast::
+    copy_and_cast_from_host_contig_blocking_fn_ptr_t;
+
+static copy_and_cast_from_host_contig_blocking_fn_ptr_t
+    copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types]
+                                                          [td_ns::num_types];
+
+void copy_numpy_ndarray_into_usm_ndarray(
+    const py::array &npy_src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends)
+{
+    int src_ndim = npy_src.ndim();
+    int dst_ndim = dst.get_ndim();
+
+    if (src_ndim != dst_ndim) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "different array ranks, "
+                              "i.e. different number of indices needed to "
+                              "address array elements.");
+    }
+
+    const py::ssize_t *src_shape = npy_src.shape();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+    for (int i = 0; shapes_equal && (i < src_ndim); ++i) {
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "different shapes.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return;
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // here we assume that NumPy's type numbers agree with ours for types
+    // supported in both
+    int src_typenum =
+        py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num;
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    py::buffer_info src_pybuf = npy_src.request();
+    const char *const src_data = static_cast<const char *>(src_pybuf.ptr);
+    char *dst_data = dst.get_data();
+
+    int src_flags = npy_src.flags();
+
+    // check for applicability of special cases:
+    // (same type && (both C-contiguous || both F-contiguous))
+    const bool both_c_contig =
+        ((src_flags & py::array::c_style) && dst.is_c_contiguous());
+    const bool both_f_contig =
+        ((src_flags & py::array::f_style) &&
dst.is_f_contiguous()); + + const bool same_data_types = (src_type_id == dst_type_id); + + if (both_c_contig || both_f_contig) { + if (same_data_types) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + copy_ev.wait(); + } + + return; + } + else { + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table + [dst_type_id][src_type_id]; + + static constexpr py::ssize_t zero_offset(0); + + copy_and_cast_from_host_contig_blocking_fn( + exec_q, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + return; + } + } + + auto const &dst_strides = + dst.get_strides_vector(); // N.B.: strides in elements + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_ndim; + const py::ssize_t *shape = src_shape; + + const py::ssize_t *src_strides_p = + npy_src.strides(); // N.B.: strides in bytes + py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes + + bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); + bool is_src_f_contig = ((src_flags & py::array::f_style) != 0); + + shT src_strides_in_elems; + if (src_strides_p) { + src_strides_in_elems.resize(nd); + // copy and convert strides from bytes to elements + std::transform( + src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems), + [src_itemsize](py::ssize_t el) { + py::ssize_t q = el / src_itemsize; + if (q * src_itemsize != el) { + throw std::runtime_error( + "NumPy array strides are not multiple of itemsize"); + } + return q; + }); + } + else { + if (is_src_c_contig) { + src_strides_in_elems = + dpctl::tensor::c_contiguous_strides(nd, src_shape); + } + else if (is_src_f_contig) { + src_strides_in_elems = + dpctl::tensor::f_contiguous_strides(nd, src_shape); + } + else { + throw py::value_error("NumPy source array has null strides but is " + "neither C- nor F-contiguous."); + } + } + + // nd, simplified_* vectors and offsets are modified by reference + simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides, + // outputs + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(1); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(1); + } + + const bool is_contig_vector = + ((nd == 1) && (simplified_src_strides.front() == 1) && + (simplified_dst_strides.front() == 1)); + + const bool can_use_memcpy = (same_data_types && is_contig_vector && + (src_offset == 0) && (dst_offset == 0)); + + if (can_use_memcpy) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = exec_q.memcpy( + static_cast(dst_data), static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a 
chance to acquire GIL + py::gil_scoped_release lock{}; + + copy_ev.wait(); + } + + return; + } + + // Minimum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(src_offset); + py::ssize_t npy_src_max_nelem_offset(src_offset); + for (int i = 0; i < nd; ++i) { + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + if (is_contig_vector) { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data, + src_offset, dst_data, + dst_offset, depends); + + return; + } + + std::vector host_task_events; + host_task_events.reserve(1); + + // Copy shape strides into device memory + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(nullptr); + } + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast:: + CopyAndCastFromHostContigFactory; + + DispatchTableBuilder + dtb_copy_from_numpy_contig; + + dtb_copy_from_numpy_contig.populate_dispatch_table( + copy_and_cast_from_host_contig_blocking_dispatch_table); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 000000000000..f2de95f97cca --- /dev/null +++ b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
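Reviewer note: the routine above picks one of three strategies. The decision logic, restated as a small self-contained Python sketch (illustrative only, not the binding itself; the full routine also re-tests the memcpy case after iteration-space simplification):

import numpy as np

def choose_copy_strategy(src: np.ndarray, dst_dtype, dst_is_c, dst_is_f) -> str:
    # mirrors the special-case checks in copy_numpy_ndarray_into_usm_ndarray
    same_type = src.dtype == np.dtype(dst_dtype)
    both_c = src.flags["C_CONTIGUOUS"] and dst_is_c
    both_f = src.flags["F_CONTIGUOUS"] and dst_is_f
    if (both_c or both_f) and same_type:
        return "queue.memcpy"        # bitwise host-to-device copy, no kernel
    if both_c or both_f:
        return "contig cast kernel"  # one linear pass with dtype conversion
    return "strided cast kernel"     # general case: shape/strides packed on device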
diff --git a/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
new file mode 100644
index 000000000000..f2de95f97cca
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void copy_numpy_ndarray_into_usm_ndarray(
+    const py::array &npy_src,
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends = {});
+
+extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/device_support_queries.cpp b/dpnp/tensor/libtensor/source/device_support_queries.cpp
new file mode 100644
index 000000000000..6026520f3daa
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/device_support_queries.cpp
@@ -0,0 +1,173 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <string>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+
+namespace
+{
+
+std::string _default_device_fp_type(const sycl::device &d)
+{
+    if (d.has(sycl::aspect::fp64)) {
+        return "f8";
+    }
+    else {
+        return "f4";
+    }
+}
+
+int get_numpy_major_version()
+{
+    py::module_ numpy = py::module_::import("numpy");
+    py::str version_string = numpy.attr("__version__");
+    py::module_ numpy_lib = py::module_::import("numpy.lib");
+
+    py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string);
+    int major_version = numpy_version.attr("major").cast<int>();
+
+    return major_version;
+}
+
+std::string _default_device_int_type(const sycl::device &)
+{
+    const int np_ver = get_numpy_major_version();
+
+    if (np_ver >= 2) {
+        return "i8";
+    }
+    else {
+        // code for numpy.dtype('long') to be consistent
+        // with NumPy's default integer type across
+        // platforms.
+        return "l";
+    }
+}
+
+std::string _default_device_uint_type(const sycl::device &)
+{
+    const int np_ver = get_numpy_major_version();
+
+    if (np_ver >= 2) {
+        return "u8";
+    }
+    else {
+        // code for numpy.dtype('ulong') to be consistent
+        // with NumPy's default unsigned integer type across
+        // platforms.
+        return "L";
+    }
+}
+
+std::string _default_device_complex_type(const sycl::device &d)
+{
+    if (d.has(sycl::aspect::fp64)) {
+        return "c16";
+    }
+    else {
+        return "c8";
+    }
+}
+
+std::string _default_device_bool_type(const sycl::device &) { return "b1"; }
+
+std::string _default_device_index_type(const sycl::device &) { return "i8"; }
+
+sycl::device _extract_device(const py::object &arg)
+{
+    auto const &api = dpctl::detail::dpctl_capi::get();
+
+    PyObject *source = arg.ptr();
+    if (api.PySyclQueue_Check_(source)) {
+        const sycl::queue &q = py::cast<sycl::queue &>(arg);
+        return q.get_device();
+    }
+    else if (api.PySyclDevice_Check_(source)) {
+        return py::cast<sycl::device &>(arg);
+    }
+    else {
+        throw py::type_error(
+            "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`.");
+    }
+}
+
+} // namespace
+
+std::string default_device_fp_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_fp_type(d);
+}
+
+std::string default_device_int_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_int_type(d);
+}
+
+std::string default_device_uint_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_uint_type(d);
+}
+
+std::string default_device_bool_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_bool_type(d);
+}
+
+std::string default_device_complex_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_complex_type(d);
+}
+
+std::string default_device_index_type(const py::object &arg)
+{
+    const sycl::device &d = _extract_device(arg);
+    return _default_device_index_type(d);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/device_support_queries.hpp b/dpnp/tensor/libtensor/source/device_support_queries.hpp
new file mode 100644
index 000000000000..adde7aefe3dd
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/device_support_queries.hpp
@@ -0,0 +1,50 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <string>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::string default_device_fp_type(const py::object &);
+extern std::string default_device_int_type(const py::object &);
+extern std::string default_device_uint_type(const py::object &);
+extern std::string default_device_bool_type(const py::object &);
+extern std::string default_device_complex_type(const py::object &);
+extern std::string default_device_index_type(const py::object &);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp
new file mode 100644
index 000000000000..067a201099de
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/abs.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "abs.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/abs.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U01: ==== ABS (x)
+namespace impl
+{
+
+namespace abs_fn_ns = dpctl::tensor::kernels::abs;
+
+static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types];
+static int abs_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    abs_strided_dispatch_vector[td_ns::num_types];
+
+void populate_abs_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = abs_fn_ns;
+
+    using fn_ns::AbsContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AbsContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(abs_contig_dispatch_vector);
+
+    using fn_ns::AbsStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AbsStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(abs_strided_dispatch_vector);
+
+    using fn_ns::AbsTypeMapFactory;
+    DispatchVectorBuilder<int, AbsTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(abs_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_abs(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_abs_dispatch_vectors();
+        using impl::abs_contig_dispatch_vector;
+        using impl::abs_output_typeid_vector;
+        using impl::abs_strided_dispatch_vector;
+
+        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, abs_output_typeid_vector,
+                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
+        };
+        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
+        };
+        m.def("_abs_result_type", abs_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
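Reviewer note: every unary entry point registered here follows the same two-step protocol: query the result dtype, allocate the destination, then run the kernel. A hedged Python sketch of that driver (names `_abs`/`_abs_result_type` come from the m.def calls above; the generic wrapper and its parameters are assumptions for illustration):

# `ti_fn` / `ti_result_type_fn` are e.g. the bound _abs and _abs_result_type
import dpctl.tensor as dpt

def unary_call(ti_fn, ti_result_type_fn, x):
    res_dt = ti_result_type_fn(x.dtype)  # None when the input dtype is unsupported
    if res_dt is None:
        raise TypeError(f"unsupported input dtype {x.dtype}")
    dst = dpt.empty_like(x, dtype=res_dt)
    ht_ev, _ = ti_fn(src=x, dst=dst, sycl_queue=x.sycl_queue)
    ht_ev.wait()
    return dst

The same pattern applies verbatim to the _acos, _acosh, _asin, and other unary bindings added later in this patch.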
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp
new file mode 100644
index 000000000000..b496f1e694ac
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/abs.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_abs(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp
new file mode 100644
index 000000000000..52d962cd828e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acos.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "acos.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/acos.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U02: ==== ACOS (x)
+namespace impl
+{
+
+namespace acos_fn_ns = dpctl::tensor::kernels::acos;
+
+static unary_contig_impl_fn_ptr_t
+    acos_contig_dispatch_vector[td_ns::num_types];
+static int acos_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acos_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acos_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acos_fn_ns;
+
+    using fn_ns::AcosContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcosContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acos_contig_dispatch_vector);
+
+    using fn_ns::AcosStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcosStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acos_strided_dispatch_vector);
+
+    using fn_ns::AcosTypeMapFactory;
+    DispatchVectorBuilder<int, AcosTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acos_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acos(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acos_dispatch_vectors();
+        using impl::acos_contig_dispatch_vector;
+        using impl::acos_output_typeid_vector;
+        using impl::acos_strided_dispatch_vector;
+
+        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acos_output_typeid_vector,
+                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
+        };
+        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
+        };
+        m.def("_acos_result_type", acos_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp
new file mode 100644
index 000000000000..608b684c4e18
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acos.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_acos(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp
new file mode 100644
index 000000000000..c2334804e422
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "acosh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/acosh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U03: ==== ACOSH (x)
+namespace impl
+{
+
+namespace acosh_fn_ns = dpctl::tensor::kernels::acosh;
+
+static unary_contig_impl_fn_ptr_t
+    acosh_contig_dispatch_vector[td_ns::num_types];
+static int acosh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acosh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acosh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acosh_fn_ns;
+
+    using fn_ns::AcoshContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcoshContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector);
+
+    using fn_ns::AcoshStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcoshStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector);
+
+    using fn_ns::AcoshTypeMapFactory;
+    DispatchVectorBuilder<int, AcoshTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acosh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acosh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acosh_dispatch_vectors();
+        using impl::acosh_contig_dispatch_vector;
+        using impl::acosh_output_typeid_vector;
+        using impl::acosh_strided_dispatch_vector;
+
+        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acosh_output_typeid_vector,
+                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
+        };
+        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              acosh_output_typeid_vector);
+        };
+        m.def("_acosh_result_type", acosh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp
new file mode 100644
index 000000000000..fc74fa99874f
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/acosh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_acosh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
new file mode 100644
index 000000000000..e37fad67e294
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.cpp
@@ -0,0 +1,243 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "add.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/add.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B01: ===== ADD (x1, x2)
+namespace impl
+{
+
+namespace add_fn_ns = dpctl::tensor::kernels::add;
+
+static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types]
+                                                            [td_ns::num_types];
+
+static int add_output_id_table[td_ns::num_types][td_ns::num_types];
+static int add_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    add_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// add(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+// add(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_add_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = add_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::AddTypeMapFactory;
+    DispatchTableBuilder<int, AddTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(add_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::AddStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, AddStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(add_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::AddContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, AddContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(add_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::AddContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        AddContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        add_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::AddContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        AddContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        add_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::AddInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         AddInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::AddInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         AddInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::AddInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         AddInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::AddInplaceTypeMapFactory;
+    DispatchTableBuilder<int, AddInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(add_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_add(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_add_dispatch_tables();
+        using impl::add_contig_dispatch_table;
+        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::add_output_id_table;
+        using impl::add_strided_dispatch_table;
+
+        auto add_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, add_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                add_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                add_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                add_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                add_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               add_output_id_table);
+        };
+        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_add_result_type", add_result_type_pyapi, "");
+
+        using impl::add_inplace_contig_dispatch_table;
+        using impl::add_inplace_output_id_table;
+        using impl::add_inplace_row_matrix_dispatch_table;
+        using impl::add_inplace_strided_dispatch_table;
+
+        auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, add_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                add_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                add_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                add_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
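Reviewer note: the binary entry point follows the same result-type/allocate/run protocol as the unary ones, plus the matrix+row broadcast fast paths. A hedged Python sketch (the `_add`/`_add_result_type` names come from the m.def calls above; the wrapper itself is illustrative):

import numpy as np
import dpctl.tensor as dpt

def add_call(ti, x1, x2):
    # ti is the compiled extension module
    res_dt = ti._add_result_type(x1.dtype, x2.dtype)
    shape = np.broadcast_shapes(x1.shape, x2.shape)
    dst = dpt.empty(shape, dtype=res_dt, sycl_queue=x1.sycl_queue)
    # contiguous matrix+row / row+matrix inputs take the dedicated broadcast kernels
    ht_ev, _ = ti._add(src1=x1, src2=x2, dst=dst, sycl_queue=x1.sycl_queue)
    ht_ev.wait()
    return dst
    # an in-place variant, _add_inplace(lhs, rhs, sycl_queue), reuses the
    # separate inplace dispatch tables registered above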
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/add.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/add.hpp
new file mode 100644
index 000000000000..0797adb79ddb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/add.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_add(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp
new file mode 100644
index 000000000000..df2b97fe7644
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/angle.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "angle.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/angle.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U43: ==== ANGLE (x)
+namespace impl
+{
+
+namespace angle_fn_ns = dpctl::tensor::kernels::angle;
+
+static unary_contig_impl_fn_ptr_t
+    angle_contig_dispatch_vector[td_ns::num_types];
+static int angle_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    angle_strided_dispatch_vector[td_ns::num_types];
+
+void populate_angle_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = angle_fn_ns;
+
+    using fn_ns::AngleContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AngleContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(angle_contig_dispatch_vector);
+
+    using fn_ns::AngleStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AngleStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(angle_strided_dispatch_vector);
+
+    using fn_ns::AngleTypeMapFactory;
+    DispatchVectorBuilder<int, AngleTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(angle_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_angle(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_angle_dispatch_vectors();
+        using impl::angle_contig_dispatch_vector;
+        using impl::angle_output_typeid_vector;
+        using impl::angle_strided_dispatch_vector;
+
+        auto angle_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, angle_output_typeid_vector,
+                angle_contig_dispatch_vector, angle_strided_dispatch_vector);
+        };
+        m.def("_angle", angle_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto angle_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              angle_output_typeid_vector);
+        };
+        m.def("_angle_result_type", angle_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
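Reviewer note: unlike abs, angle maps complex inputs to real outputs, which is exactly what the AngleTypeMapFactory-built typeid vector encodes. Roughly, in Python (an assumed mapping for illustration; the authoritative table is angle_output_typeid_vector):

import numpy as np

ANGLE_RESULT = {
    np.dtype("complex64"): np.dtype("float32"),
    np.dtype("complex128"): np.dtype("float64"),
}

def angle_result_dtype(dt):
    # mirrors what _angle_result_type reports for complex inputs
    return ANGLE_RESULT.get(np.dtype(dt))  # None for unsupported dtypes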
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_angle(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp new file mode 100644 index 000000000000..32d71c67527e --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/asin.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "asin.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/asin.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U04: ==== ASIN (x)
+namespace impl
+{
+
+namespace asin_fn_ns = dpctl::tensor::kernels::asin;
+
+static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types];
+static int asin_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asin_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asin_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asin_fn_ns;
+
+    using fn_ns::AsinContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asin_contig_dispatch_vector);
+
+    using fn_ns::AsinStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asin_strided_dispatch_vector);
+
+    using fn_ns::AsinTypeMapFactory;
+    DispatchVectorBuilder<int, AsinTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asin_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asin_dispatch_vectors();
+        using impl::asin_contig_dispatch_vector;
+        using impl::asin_output_typeid_vector;
+        using impl::asin_strided_dispatch_vector;
+
+        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asin_output_typeid_vector,
+                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
+        };
+        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
+        };
+        m.def("_asin_result_type", asin_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp
new file mode 100644
index 000000000000..39230000bdfc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/asin.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_asin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp new file mode 100644 index 000000000000..47f8a7dbf190 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "asinh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/asinh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U05: ==== ASINH (x)
+namespace impl
+{
+
+namespace asinh_fn_ns = dpctl::tensor::kernels::asinh;
+
+static unary_contig_impl_fn_ptr_t
+    asinh_contig_dispatch_vector[td_ns::num_types];
+static int asinh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asinh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asinh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asinh_fn_ns;
+
+    using fn_ns::AsinhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector);
+
+    using fn_ns::AsinhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector);
+
+    using fn_ns::AsinhTypeMapFactory;
+    DispatchVectorBuilder<int, AsinhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asinh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asinh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asinh_dispatch_vectors();
+        using impl::asinh_contig_dispatch_vector;
+        using impl::asinh_output_typeid_vector;
+        using impl::asinh_strided_dispatch_vector;
+
+        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asinh_output_typeid_vector,
+                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
+        };
+        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              asinh_output_typeid_vector);
+        };
+        m.def("_asinh_result_type", asinh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp
new file mode 100644
index
000000000000..0d761f082ae3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/asinh.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_asinh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp new file mode 100644 index 000000000000..74ee82edbbc9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atan.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atan.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U06: ==== ATAN (x)
+namespace impl
+{
+
+namespace atan_fn_ns = dpctl::tensor::kernels::atan;
+
+static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types];
+static int atan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan_fn_ns;
+
+    using fn_ns::AtanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atan_contig_dispatch_vector);
+
+    using fn_ns::AtanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atan_strided_dispatch_vector);
+
+    using fn_ns::AtanTypeMapFactory;
+    DispatchVectorBuilder<int, AtanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan_dispatch_vectors();
+        using impl::atan_contig_dispatch_vector;
+        using impl::atan_output_typeid_vector;
+        using impl::atan_strided_dispatch_vector;
+
+        auto atan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atan_output_typeid_vector,
+                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
+        };
+        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
+        };
+        m.def("_atan_result_type", atan_result_type_pyapi);
+    }
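[Editor's aside, not part of the patch: each `_atan`-style lambda above forwards to py_unary_ufunc, which gates the request on the type-map table and then picks either the contiguous fast path or the strided fallback. The sketch below uses invented signatures to show that selection step; it is not the real py_unary_ufunc API.]

// Hypothetical selection step behind a unary ufunc binding (assumed types).
#include <stdexcept>

using unary_fn_t = void (*)(const void *, void *, unsigned long);

unary_fn_t select_unary_impl(int src_typeid,
                             const int *output_typeid_table, // e.g. the type-map vector
                             const unary_fn_t *contig_table,
                             const unary_fn_t *strided_table,
                             bool src_and_dst_contiguous)
{
    if (output_typeid_table[src_typeid] < 0) {
        // mirrors the error raised for an unsupported input dtype
        throw std::runtime_error("unsupported input dtype");
    }
    if (src_and_dst_contiguous && contig_table[src_typeid]) {
        return contig_table[src_typeid]; // single linear-walk kernel
    }
    return strided_table[src_typeid]; // general shape/stride-aware kernel
}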
+} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp new file mode 100644 index 000000000000..c4eb3f3baf92 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_atan(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp new file mode 100644 index 000000000000..60bb2e081fef --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atan2.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atan2.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B02: ===== ATAN2 (x1, x2)
+namespace impl
+{
+namespace atan2_fn_ns = dpctl::tensor::kernels::atan2;
+
+static binary_contig_impl_fn_ptr_t
+    atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int atan2_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_atan2_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan2_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::Atan2TypeMapFactory;
+    DispatchTableBuilder<int, Atan2TypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(atan2_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::Atan2StridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, Atan2StridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(atan2_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::Atan2ContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, Atan2ContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(atan2_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_atan2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan2_dispatch_tables();
+        using impl::atan2_contig_dispatch_table;
+        using impl::atan2_output_id_table;
+        using impl::atan2_strided_dispatch_table;
+
+        auto atan2_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, atan2_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                atan2_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                atan2_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto atan2_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               atan2_output_id_table);
+        };
+        m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_atan2_result_type", atan2_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp
new file mode 100644
index 000000000000..5bdf9b74db2e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/atan2.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
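[Editor's aside, not part of the patch: binary operators such as `_atan2` consult a two-dimensional result-type table rather than the one-dimensional vector used by unary functions. A small illustrative sketch follows; the 2x2 table contents and type ids are invented for the example, while the real table is filled by Atan2TypeMapFactory over td_ns::num_types ids.]

// Sketch of the 2-D lookup behind _atan2_result_type (assumed contents).
constexpr int num_types = 2; // pretend type ids: 0=float, 1=double

constexpr int output_id_table[num_types][num_types] = {
    {0, 1}, // float  op float -> float,  float  op double -> double
    {1, 1}, // double op float -> double, double op double -> double
};

// A negative entry conventionally marks an unsupported dtype pair.
constexpr int result_typeid(int t1, int t2)
{
    return output_id_table[t1][t2];
}

static_assert(result_typeid(0, 1) == 1, "float op double promotes to double");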
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_atan2(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp new file mode 100644 index 000000000000..2857f9ab8c10 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "atanh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/atanh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U07: ==== ATANH (x)
+namespace impl
+{
+
+namespace atanh_fn_ns = dpctl::tensor::kernels::atanh;
+
+static unary_contig_impl_fn_ptr_t
+    atanh_contig_dispatch_vector[td_ns::num_types];
+static int atanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atanh_fn_ns;
+
+    using fn_ns::AtanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector);
+
+    using fn_ns::AtanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector);
+
+    using fn_ns::AtanhTypeMapFactory;
+    DispatchVectorBuilder<int, AtanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atanh_dispatch_vectors();
+        using impl::atanh_contig_dispatch_vector;
+        using impl::atanh_output_typeid_vector;
+        using impl::atanh_strided_dispatch_vector;
+
+        auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atanh_output_typeid_vector,
+                atanh_contig_dispatch_vector, atanh_strided_dispatch_vector);
+        };
+        m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              atanh_output_typeid_vector);
+        };
+        m.def("_atanh_result_type", atanh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp
new file mode 100644
index 000000000000..5604e48deef6
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/atanh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_atanh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp new file mode 100644 index 000000000000..3976f480ff6d --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp @@ -0,0 +1,206 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_and.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_and.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B03: ===== BITWISE_AND (x1, x2)
+namespace impl
+{
+namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_and_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_and_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_and_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_bitwise_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseAndTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseAndInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseAndInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(bitwise_and_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseAndInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseAndInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(bitwise_and_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseAndInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(bitwise_and_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_and_dispatch_tables();
+        using impl::bitwise_and_contig_dispatch_table;
+        using impl::bitwise_and_output_id_table;
+        using impl::bitwise_and_strided_dispatch_table;
+
+        auto bitwise_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_and_output_id_table);
+        };
+        m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, "");
+
+        using impl::bitwise_and_inplace_contig_dispatch_table;
+        using impl::bitwise_and_inplace_output_id_table;
+        using impl::bitwise_and_inplace_strided_dispatch_table;
+
+        auto bitwise_and_inplace_pyapi = [&](const arrayT &src,
+                                             const arrayT &dst,
+                                             sycl::queue &exec_q,
+                                             const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, bitwise_and_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                bitwise_and_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                bitwise_and_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_bitwise_and_inplace", bitwise_and_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
new file mode
100644 index 000000000000..19f29ae8822e --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_and(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp new file mode 100644 index 000000000000..05e7f4eeb61b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp @@ -0,0 +1,129 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_invert.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_invert.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U08: ===== BITWISE_INVERT (x)
+namespace impl
+{
+
+namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert;
+
+static unary_contig_impl_fn_ptr_t
+    bitwise_invert_contig_dispatch_vector[td_ns::num_types];
+static int bitwise_invert_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    bitwise_invert_strided_dispatch_vector[td_ns::num_types];
+
+void populate_bitwise_invert_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_invert_fn_ns;
+
+    using fn_ns::BitwiseInvertContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t,
+                          BitwiseInvertContigFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector);
+
+    using fn_ns::BitwiseInvertStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          BitwiseInvertStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector);
+
+    using fn_ns::BitwiseInvertTypeMapFactory;
+    DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_bitwise_invert(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_invert_dispatch_vectors();
+        using impl::bitwise_invert_contig_dispatch_vector;
+        using impl::bitwise_invert_output_typeid_vector;
+        using impl::bitwise_invert_strided_dispatch_vector;
+
+        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q,
depends, + bitwise_invert_output_typeid_vector, + bitwise_invert_contig_dispatch_vector, + bitwise_invert_strided_dispatch_vector); + }; + m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type( + dtype, bitwise_invert_output_typeid_vector); + }; + m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp new file mode 100644 index 000000000000..e20c0df3cf11 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
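[Editor's aside, not part of the patch: the in-place variants registered above (e.g. `_bitwise_and_inplace`) carry their own type-map table because the result must land in the left-hand operand's dtype, which is overwritten. The sketch below illustrates that extra constraint with invented table contents and type ids.]

// Sketch of the in-place type gate (assumed contents, not the real tables).
constexpr int num_types = 2; // pretend type ids: 0=int32, 1=int64

constexpr int inplace_id_table[num_types][num_types] = {
    {0, -1}, // int32 &= int32 ok; int32 &= int64 rejected (would narrow)
    {1, 1},  // int64 absorbs either operand
};

constexpr bool inplace_supported(int lhs_typeid, int rhs_typeid)
{
    return inplace_id_table[lhs_typeid][rhs_typeid] == lhs_typeid;
}

static_assert(!inplace_supported(0, 1), "no implicit narrowing in-place");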
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_invert(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp new file mode 100644 index 000000000000..c26c9a42864f --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp @@ -0,0 +1,216 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "bitwise_left_shift.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/bitwise_left_shift.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
+namespace impl
+{
+namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_left_shift_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+static int bitwise_left_shift_output_id_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static int bitwise_left_shift_inplace_output_id_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_left_shift_strided_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_left_shift_inplace_contig_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_left_shift_inplace_strided_dispatch_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+
+void populate_bitwise_left_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_left_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseLeftShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        bitwise_left_shift_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        bitwise_left_shift_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::BitwiseLeftShiftInplaceTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftInplaceTypeMapFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(bitwise_left_shift_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_bitwise_left_shift(py::module_ m)
+{
+    using
+
+void init_bitwise_left_shift(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_left_shift_dispatch_tables();
+        using impl::bitwise_left_shift_contig_dispatch_table;
+        using impl::bitwise_left_shift_output_id_table;
+        using impl::bitwise_left_shift_strided_dispatch_table;
+
+        auto bitwise_left_shift_pyapi = [&](const arrayT &src1,
+                                            const arrayT &src2,
+                                            const arrayT &dst,
+                                            sycl::queue &exec_q,
+                                            const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_left_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_left_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_left_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_left_shift_result_type_pyapi =
+            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
+                return py_binary_ufunc_result_type(
+                    dtype1, dtype2, bitwise_left_shift_output_id_table);
+            };
+        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_left_shift_result_type",
+              bitwise_left_shift_result_type_pyapi, "");
+
+        using impl::bitwise_left_shift_inplace_contig_dispatch_table;
+        using impl::bitwise_left_shift_inplace_output_id_table;
+        using impl::bitwise_left_shift_inplace_strided_dispatch_table;
+
+        auto bitwise_left_shift_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends,
+                    bitwise_left_shift_inplace_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_left_shift_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_left_shift_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_left_shift_inplace", bitwise_left_shift_inplace_pyapi,
+              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
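The td_ns::NullPtrTable<...>{} arguments above deliberately supply tables whose entries are all null pointers: the shift operation registers no specialized matrix-row broadcast kernels, so py_binary_ufunc falls back to the strided implementation for those shapes. A toy equivalent of such a table, under hypothetical Demo* names rather than dpctl's definition:

#include <cstddef>

using demo_fn_ptr_t = void (*)(std::size_t);

template <typename fnT, int n = 3> struct DemoNullPtrTable
{
    fnT table[n][n] = {}; // value-initialization: every entry is nullptr
};

int main()
{
    DemoNullPtrTable<demo_fn_ptr_t> t{};
    // dispatch code checks the entry and takes the strided fallback
    // whenever it finds nullptr here
    return (t.table[1][2] == nullptr) ? 0 : 1;
}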
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
new file mode 100644
index 000000000000..49a7947d98c3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_bitwise_left_shift(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
new file mode 100644
index 000000000000..bbb138c406fb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
@@ -0,0 +1,206 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "bitwise_or.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/bitwise_or.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B05: ===== BITWISE_OR (x1, x2) +namespace impl +{ +namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or; + +static binary_contig_impl_fn_ptr_t + bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_or_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_or_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_or_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_or_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_or_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseOrTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(bitwise_or_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseOrStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseOrContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using 
fn_ns::BitwiseOrInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(bitwise_or_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseOrInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(bitwise_or_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::BitwiseOrInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(bitwise_or_inplace_output_id_table); +}; + +} // namespace impl + +void init_bitwise_or(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_bitwise_or_dispatch_tables(); + using impl::bitwise_or_contig_dispatch_table; + using impl::bitwise_or_output_id_table; + using impl::bitwise_or_strided_dispatch_table; + + auto bitwise_or_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_or_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_or_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_or_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_or_output_id_table); + }; + m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, ""); + + using impl::bitwise_or_inplace_contig_dispatch_table; + using impl::bitwise_or_inplace_output_id_table; + using impl::bitwise_or_inplace_strided_dispatch_table; + + auto bitwise_or_inplace_pyapi = [&](const arrayT &src, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_or_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_or_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_or_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_or_inplace", bitwise_or_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp new file mode 100644 index 000000000000..1e24caa54429 --- 
/dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_or(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp new file mode 100644 index 000000000000..099dd56b4484 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp @@ -0,0 +1,217 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "bitwise_right_shift.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/bitwise_right_shift.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) +namespace impl +{ +namespace bitwise_right_shift_fn_ns = + dpctl::tensor::kernels::bitwise_right_shift; + +static binary_contig_impl_fn_ptr_t + bitwise_right_shift_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static int bitwise_right_shift_output_id_table[td_ns::num_types] + [td_ns::num_types]; +static int bitwise_right_shift_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_right_shift_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_right_shift_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_right_shift_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_right_shift_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_right_shift_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseRightShiftTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table); + + // function pointers for operation on general strided arrays + using 
fn_ns::BitwiseRightShiftStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseRightShiftInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + bitwise_right_shift_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseRightShiftInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table( + bitwise_right_shift_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::BitwiseRightShiftInplaceTypeMapFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(bitwise_right_shift_inplace_output_id_table); +}; + +} // namespace impl + +void init_bitwise_right_shift(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_bitwise_right_shift_dispatch_tables(); + using impl::bitwise_right_shift_contig_dispatch_table; + using impl::bitwise_right_shift_output_id_table; + using impl::bitwise_right_shift_strided_dispatch_table; + + auto bitwise_right_shift_pyapi = [&](const arrayT &src1, + const arrayT &src2, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, + bitwise_right_shift_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_right_shift_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_right_shift_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_right_shift_result_type_pyapi = + [&](const py::dtype &dtype1, const py::dtype &dtype2) { + return py_binary_ufunc_result_type( + dtype1, dtype2, bitwise_right_shift_output_id_table); + }; + m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "", + py::arg("src1"), py::arg("src2"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_bitwise_right_shift_result_type", + bitwise_right_shift_result_type_pyapi, ""); + + using impl::bitwise_right_shift_inplace_contig_dispatch_table; + using impl::bitwise_right_shift_inplace_output_id_table; + using impl::bitwise_right_shift_inplace_strided_dispatch_table; + + auto bitwise_right_shift_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, + bitwise_right_shift_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_right_shift_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided 
+ // arrays (most general case) + bitwise_right_shift_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_right_shift_inplace", bitwise_right_shift_inplace_pyapi, + "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp new file mode 100644 index 000000000000..aeb24d73b2fc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_right_shift(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp new file mode 100644 index 000000000000..9a23fec82e72 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp @@ -0,0 +1,206 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "bitwise_xor.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/bitwise_xor.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B07: ===== BITWISE_XOR (x1, x2) +namespace impl +{ +namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor; + +static binary_contig_impl_fn_ptr_t + bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_xor_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_xor_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_xor_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_xor_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_xor_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseXorTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(bitwise_xor_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseXorStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseXorContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseXorInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(bitwise_xor_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseXorInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(bitwise_xor_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::BitwiseXorInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(bitwise_xor_inplace_output_id_table); +}; + +} // namespace impl + +void init_bitwise_xor(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_bitwise_xor_dispatch_tables(); + using impl::bitwise_xor_contig_dispatch_table; + using 
impl::bitwise_xor_output_id_table; + using impl::bitwise_xor_strided_dispatch_table; + + auto bitwise_xor_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_xor_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_xor_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_xor_output_id_table); + }; + m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, ""); + + using impl::bitwise_xor_inplace_contig_dispatch_table; + using impl::bitwise_xor_inplace_output_id_table; + using impl::bitwise_xor_inplace_strided_dispatch_table; + + auto bitwise_xor_inplace_pyapi = [&](const arrayT &src, + const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_xor_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_xor_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_xor_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_xor_inplace", bitwise_xor_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp new file mode 100644 index 000000000000..4029574cdd7d --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_bitwise_xor(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp new file mode 100644 index 000000000000..a061235acfd7 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cbrt.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/cbrt.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U37: ==== CBRT (x)
+namespace impl
+{
+
+namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt;
+
+static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types];
+static int cbrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cbrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cbrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cbrt_fn_ns;
+
+    using fn_ns::CbrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector);
+
+    using fn_ns::CbrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector);
+
+    using fn_ns::CbrtTypeMapFactory;
+    DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cbrt_output_typeid_vector);
+};
+
+} // namespace impl
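Unary functions such as cbrt use a one-dimensional dispatch vector instead of a table: one slot per input type id, nullptr where the type is unsupported. A self-contained sketch of that pattern, assuming a made-up three-type universe and demo_* names (the real population loop is generated by DispatchVectorBuilder over the full type list):

#include <cmath>
#include <cstdio>

using unary_fn_t = double (*)(double);
constexpr int num_types_demo = 3; // pretend ids: 0=int32, 1=float, 2=double

static unary_fn_t demo_cbrt_dispatch_vector[num_types_demo];

static double cbrt_impl(double x) { return std::cbrt(x); }

static void populate_demo_dispatch_vector()
{
    demo_cbrt_dispatch_vector[0] = nullptr;    // no integer kernel in this toy
    demo_cbrt_dispatch_vector[1] = &cbrt_impl; // float input
    demo_cbrt_dispatch_vector[2] = &cbrt_impl; // double input
}

int main()
{
    populate_demo_dispatch_vector();
    int src_typeid = 2; // as would be reported for the input usm_ndarray
    unary_fn_t fn = demo_cbrt_dispatch_vector[src_typeid];
    if (fn) {
        std::printf("cbrt(27.0) = %g\n", fn(27.0)); // prints 3
    }
    return 0;
}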
+
+void init_cbrt(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cbrt_dispatch_vectors();
+        using impl::cbrt_contig_dispatch_vector;
+        using impl::cbrt_output_typeid_vector;
+        using impl::cbrt_strided_dispatch_vector;
+
+        auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cbrt_output_typeid_vector,
+                cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector);
+        };
+        m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector);
+        };
+        m.def("_cbrt_result_type", cbrt_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp
new file mode 100644
index 000000000000..53757bff7134
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/cbrt.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_cbrt(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp
new file mode 100644
index 000000000000..4c4604e31692
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "ceil.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/ceil.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U09: ==== CEIL (x) +namespace impl +{ + +namespace ceil_fn_ns = dpctl::tensor::kernels::ceil; + +static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types]; +static int ceil_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + ceil_strided_dispatch_vector[td_ns::num_types]; + +void populate_ceil_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = ceil_fn_ns; + + using fn_ns::CeilContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector); + + using fn_ns::CeilStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector); + + using fn_ns::CeilTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(ceil_output_typeid_vector); +}; + +} // namespace impl + +void init_ceil(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_ceil_dispatch_vectors(); + using impl::ceil_contig_dispatch_vector; + using impl::ceil_output_typeid_vector; + using impl::ceil_strided_dispatch_vector; + + auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, ceil_output_typeid_vector, + ceil_contig_dispatch_vector, ceil_strided_dispatch_vector); + }; + m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto ceil_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector); + }; + m.def("_ceil_result_type", ceil_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp new file mode 100644 index 000000000000..436cb5f89b2b --- /dev/null +++ 
b/dpnp/tensor/libtensor/source/elementwise_functions/ceil.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_ceil(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp new file mode 100644 index 000000000000..cee977f719f4 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/conj.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "conj.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/conj.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U10: ==== CONJ (x)
+namespace impl
+{
+
+namespace conj_fn_ns = dpctl::tensor::kernels::conj;
+
+static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types];
+static int conj_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    conj_strided_dispatch_vector[td_ns::num_types];
+
+void populate_conj_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = conj_fn_ns;
+
+    using fn_ns::ConjContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ConjContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(conj_contig_dispatch_vector);
+
+    using fn_ns::ConjStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ConjStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(conj_strided_dispatch_vector);
+
+    using fn_ns::ConjTypeMapFactory;
+    DispatchVectorBuilder<int, ConjTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(conj_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_conj(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_conj_dispatch_vectors();
+        using impl::conj_contig_dispatch_vector;
+        using impl::conj_output_typeid_vector;
+        using impl::conj_strided_dispatch_vector;
+
+        auto conj_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, conj_output_typeid_vector,
+                conj_contig_dispatch_vector, conj_strided_dispatch_vector);
+        };
+        m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto conj_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector);
+        };
+        m.def("_conj_result_type", conj_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
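The _conj_result_type binding above exposes the populated typeid vector as a dtype query; conceptually it is a single table lookup. A hedged sketch with illustrative names (the real query additionally converts between py::dtype and internal type ids):

#include <cstdio>

constexpr int num_types_demo = 3; // pretend ids: 0=int32, 1=float, 2=complex

// conj is defined for every numeric type and preserves the dtype, hence the
// identity mapping in this toy table; -1 would mean "unsupported"
static int demo_conj_output_typeid_vector[num_types_demo] = {0, 1, 2};

static int demo_result_type(int src_typeid)
{
    return demo_conj_output_typeid_vector[src_typeid];
}

int main()
{
    std::printf("result typeid for complex input: %d\n", demo_result_type(2));
    return 0;
}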
+} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp new file mode 100644 index 000000000000..4c0aeb17260b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/conj.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_conj(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp new file mode 100644 index 000000000000..8dca1635459a --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "copysign.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/copysign.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B25: ===== COPYSIGN (x1, x2)
+namespace impl
+{
+namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
+
+static binary_contig_impl_fn_ptr_t
+    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_copysign_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = copysign_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::CopysignTypeMapFactory;
+    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(copysign_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::CopysignStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::CopysignContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_copysign(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_copysign_dispatch_tables();
+        using impl::copysign_contig_dispatch_table;
using impl::copysign_output_id_table; + using impl::copysign_strided_dispatch_table; + + auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, copysign_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + copysign_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + copysign_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto copysign_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + copysign_output_id_table); + }; + m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_copysign_result_type", copysign_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp new file mode 100644 index 000000000000..875443d792c2 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/copysign.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
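For binary operators such as copysign the lookup is two-dimensional: the output-type table and the kernel tables are indexed by the pair of input type ids, and an empty slot (the role td_ns::NullPtrTable plays above for the unused matrix-row broadcast specializations) means "no specialized kernel here". A self-contained sketch of that idea with invented stand-in names, not the patch's actual types:

    // Sketch of a (src1 typeid, src2 typeid) indexed binary dispatch table.
    #include <cstddef>
    #include <cstdio>

    constexpr int num_types = 2; // pretend: 0 = int, 1 = float

    using binary_fn_t = void (*)(const void *, const void *, void *, std::size_t);

    template <typename T1, typename T2, typename Tout>
    void add_impl(const void *a, const void *b, void *r, std::size_t n)
    {
        auto *pa = static_cast<const T1 *>(a);
        auto *pb = static_cast<const T2 *>(b);
        auto *pr = static_cast<Tout *>(r);
        for (std::size_t i = 0; i < n; ++i)
            pr[i] = static_cast<Tout>(pa[i]) + static_cast<Tout>(pb[i]);
    }

    // A negative entry would mean the type combination is unsupported.
    static int output_id_table[num_types][num_types] = {{0, 1}, {1, 1}};
    static binary_fn_t contig_table[num_types][num_types] = {
        {add_impl<int, int, int>, add_impl<int, float, float>},
        {add_impl<float, int, float>, add_impl<float, float, float>}};

    int main()
    {
        int a[2] = {1, 2};
        float b[2] = {0.5f, 0.25f}, r[2];
        if (auto fn = contig_table[0][1]) // nullptr would mean: fall back
            fn(a, b, r, 2);
        std::printf("out typeid=%d, r={%g, %g}\n",
                    output_id_table[0][1], r[0], r[1]); // typeid=1, {1.5, 2.25}
    }

Passing an all-nullptr table for the broadcast specializations, as init_copysign does, simply steers py_binary_ufunc to the general strided path for those shapes.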
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_copysign(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp new file mode 100644 index 000000000000..966364c8b8c0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/cos.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
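Each pyapi lambda in these files returns the pair produced by py_unary_ufunc or py_binary_ufunc: a host-task event that keeps the Python-side array owners alive, and the computational event that later submissions can depend on. A minimal SYCL sketch of that convention; launch_fill and its body are illustrative, and the plain host task only imitates what dpctl::utils::keep_args_alive does:

    // Sketch of the (host_task_event, computational_event) return convention.
    // Compiles as a translation unit with a SYCL compiler (e.g. icpx -fsycl).
    #include <sycl/sycl.hpp>

    #include <cstddef>
    #include <utility>
    #include <vector>

    std::pair<sycl::event, sycl::event>
        launch_fill(sycl::queue &q, float *data, std::size_t n, float v,
                    const std::vector<sycl::event> &depends)
    {
        // The actual computation; callers can chain on this event.
        sycl::event comp_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(depends);
            cgh.parallel_for(sycl::range<1>{n},
                             [=](sycl::id<1> i) { data[i] = v; });
        });
        // Stand-in for keep_args_alive: a host task that completes only after
        // the computation, so reference holders are released no earlier.
        sycl::event ht_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(comp_ev);
            cgh.host_task([]() { /* drop owner references here */ });
        });
        return std::make_pair(ht_ev, comp_ev);
    }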
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cos.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/cos.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U11: ==== COS (x)
+namespace impl
+{
+
+namespace cos_fn_ns = dpctl::tensor::kernels::cos;
+
+static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types];
+static int cos_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cos_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cos_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cos_fn_ns;
+
+    using fn_ns::CosContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CosContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cos_contig_dispatch_vector);
+
+    using fn_ns::CosStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CosStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cos_strided_dispatch_vector);
+
+    using fn_ns::CosTypeMapFactory;
+    DispatchVectorBuilder<int, CosTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cos_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_cos(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cos_dispatch_vectors();
+        using impl::cos_contig_dispatch_vector;
+        using impl::cos_output_typeid_vector;
+        using impl::cos_strided_dispatch_vector;
+
+        auto cos_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cos_output_typeid_vector,
+                cos_contig_dispatch_vector, cos_strided_dispatch_vector);
+        };
+        m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cos_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector);
+        };
+        m.def("_cos_result_type", cos_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp
new file mode 100644
index 000000000000..4b9ab341a355
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/cos.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cos(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp new file mode 100644 index 000000000000..54fc5d57e4df --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
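The *_strided_dispatch_vector entries registered in these files exist because a kernel over non-contiguous data must translate a flat iteration index into per-array offsets using shape and stride vectors, which is exactly what simplify_iteration_space prepares in the py_unary_ufunc template later in this patch. A standalone CPU sketch of that unraveling, with invented names:

    // Sketch of what a "strided" kernel does compared with a contiguous one.
    #include <cstddef>
    #include <cstdio>

    // Apply dst = -src over an nd grid described by shape[] and per-dimension
    // element strides src_strides[]/dst_strides[].
    void negate_strided(const float *src, float *dst, int nd, const long *shape,
                        const long *src_strides, const long *dst_strides,
                        std::size_t nelems)
    {
        for (std::size_t flat = 0; flat < nelems; ++flat) {
            std::size_t rem = flat;
            long src_off = 0, dst_off = 0;
            for (int d = nd - 1; d >= 0; --d) { // unravel the flat index
                long idx = static_cast<long>(rem % shape[d]);
                rem /= shape[d];
                src_off += idx * src_strides[d];
                dst_off += idx * dst_strides[d];
            }
            dst[dst_off] = -src[src_off];
        }
    }

    int main()
    {
        // 2x2 view over a 2x3 buffer: src strides {3, 1}, contiguous dst {2, 1}
        float buf[6] = {1, 2, 3, 4, 5, 6}, out[4];
        long shape[2] = {2, 2}, sstr[2] = {3, 1}, dstr[2] = {2, 1};
        negate_strided(buf, out, 2, shape, sstr, dstr, 4);
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // -1 -2 -4 -5
    }

When both arrays are contiguous this index arithmetic is pure overhead, which is why the host functions below check for the contiguous fast path first.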
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "cosh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/cosh.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U12: ==== COSH (x)
+namespace impl
+{
+
+namespace cosh_fn_ns = dpctl::tensor::kernels::cosh;
+
+static unary_contig_impl_fn_ptr_t
+    cosh_contig_dispatch_vector[td_ns::num_types];
+static int cosh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cosh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cosh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cosh_fn_ns;
+
+    using fn_ns::CoshContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CoshContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector);
+
+    using fn_ns::CoshStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CoshStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector);
+
+    using fn_ns::CoshTypeMapFactory;
+    DispatchVectorBuilder<int, CoshTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cosh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_cosh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_cosh_dispatch_vectors();
+        using impl::cosh_contig_dispatch_vector;
+        using impl::cosh_output_typeid_vector;
+        using impl::cosh_strided_dispatch_vector;
+
+        auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cosh_output_typeid_vector,
+                cosh_contig_dispatch_vector, cosh_strided_dispatch_vector);
+        };
+        m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cosh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector);
+        };
+        m.def("_cosh_result_type", cosh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp
new file mode 100644
index 000000000000..6ddfe5643b54
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/cosh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cosh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp new file mode 100644 index 000000000000..dc09318d66ad --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp @@ -0,0 +1,191 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include "abs.hpp" +#include "acos.hpp" +#include "acosh.hpp" +#include "add.hpp" +#include "angle.hpp" +#include "asin.hpp" +#include "asinh.hpp" +#include "atan.hpp" +#include "atan2.hpp" +#include "atanh.hpp" +#include "bitwise_and.hpp" +#include "bitwise_invert.hpp" +#include "bitwise_left_shift.hpp" +#include "bitwise_or.hpp" +#include "bitwise_right_shift.hpp" +#include "bitwise_xor.hpp" +#include "cbrt.hpp" +#include "ceil.hpp" +#include "conj.hpp" +#include "copysign.hpp" +#include "cos.hpp" +#include "cosh.hpp" +#include "equal.hpp" +#include "exp.hpp" +#include "exp2.hpp" +#include "expm1.hpp" +#include "floor.hpp" +#include "floor_divide.hpp" +#include "greater.hpp" +#include "greater_equal.hpp" +#include "hypot.hpp" +#include "imag.hpp" +#include "isfinite.hpp" +#include "isinf.hpp" +#include "isnan.hpp" +#include "less.hpp" +#include "less_equal.hpp" +#include "log.hpp" +#include "log10.hpp" +#include "log1p.hpp" +#include "log2.hpp" +#include "logaddexp.hpp" +#include "logical_and.hpp" +#include "logical_not.hpp" +#include "logical_or.hpp" +#include "logical_xor.hpp" +#include "maximum.hpp" +#include "minimum.hpp" +#include "multiply.hpp" +#include "negative.hpp" +#include "nextafter.hpp" +#include "not_equal.hpp" +#include "positive.hpp" +#include "pow.hpp" +#include "proj.hpp" +#include "real.hpp" +#include "reciprocal.hpp" +#include "remainder.hpp" +#include "round.hpp" +#include "rsqrt.hpp" +#include "sign.hpp" +#include "signbit.hpp" +#include "sin.hpp" +#include "sinh.hpp" +#include "sqrt.hpp" +#include "square.hpp" +#include "subtract.hpp" +#include "tan.hpp" +#include "tanh.hpp" +#include "true_divide.hpp" +#include "trunc.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; + +/*! 
@brief Add elementwise functions to Python module */ +void init_elementwise_functions(py::module_ m) +{ + init_abs(m); + init_acos(m); + init_acosh(m); + init_add(m); + init_angle(m); + init_asin(m); + init_asinh(m); + init_atan(m); + init_atan2(m); + init_atanh(m); + init_bitwise_and(m); + init_bitwise_invert(m); + init_bitwise_left_shift(m); + init_bitwise_or(m); + init_bitwise_right_shift(m); + init_bitwise_xor(m); + init_cbrt(m); + init_ceil(m); + init_conj(m); + init_copysign(m); + init_cos(m); + init_cosh(m); + init_divide(m); + init_equal(m); + init_exp(m); + init_exp2(m); + init_expm1(m); + init_floor(m); + init_floor_divide(m); + init_greater(m); + init_greater_equal(m); + init_hypot(m); + init_imag(m); + init_isfinite(m); + init_isinf(m); + init_isnan(m); + init_less(m); + init_less_equal(m); + init_log(m); + init_log10(m); + init_log1p(m); + init_log2(m); + init_logaddexp(m); + init_logical_and(m); + init_logical_not(m); + init_logical_or(m); + init_logical_xor(m); + init_maximum(m); + init_minimum(m); + init_multiply(m); + init_nextafter(m); + init_negative(m); + init_not_equal(m); + init_positive(m); + init_pow(m); + init_proj(m); + init_real(m); + init_reciprocal(m); + init_remainder(m); + init_round(m); + init_rsqrt(m); + init_sign(m); + init_signbit(m); + init_sin(m); + init_sinh(m); + init_sqrt(m); + init_square(m); + init_subtract(m); + init_tan(m); + init_tanh(m); + init_trunc(m); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp new file mode 100644 index 000000000000..0c385f2d15a5 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
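init_elementwise_functions above aggregates one init_* call per operator into a single entry point. The module definition that would invoke it is not part of this excerpt, so the module name in the following sketch is an assumption made only for illustration:

    // Hypothetical wiring of the aggregate initializer into a pybind11
    // extension module; "_tensor_impl" is an assumed, illustrative name.
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    namespace dpctl::tensor::py_internal
    {
    extern void init_elementwise_functions(py::module_);
    }

    PYBIND11_MODULE(_tensor_impl, m)
    {
        // Registers _abs, _add, ..., _trunc and their *_result_type queries.
        dpctl::tensor::py_internal::init_elementwise_functions(m);
        // ... other initializers would follow here ...
    }

Splitting each operator into its own translation unit and meeting only at this aggregation point keeps rebuilds incremental and lets the compiler instantiate the heavy kernel templates in parallel.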
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_elementwise_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp new file mode 100644 index 000000000000..3a8dc6bfb56f --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -0,0 +1,807 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "elementwise_functions_type_utils.hpp" +#include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +static_assert(std::is_same_v); + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +/*! @brief Template implementing Python API for unary elementwise functions */ +template +std::pair + py_unary_ufunc(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &q, + const std::vector &depends, + // + const output_typesT &output_type_vec, + const contig_dispatchT &contig_dispatch_vector, + const strided_dispatchT &strided_dispatch_vector) +{ + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int func_output_typeid = output_type_vec[src_typeid]; + + // check that types are supported + if (dst_typeid != func_output_typeid) { + throw py::value_error( + "Destination array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check that dimensions are the same + int src_nd = src.get_ndim(); + if (src_nd != dst.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; i < src_nd; ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check memory overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if (overlap(src, dst) && !same_logical_tensors(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // handle contiguous inputs + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = 
(is_src_f_contig && is_dst_f_contig); + + if (both_c_contig || both_f_contig) { + auto contig_fn = contig_dispatch_vector[src_typeid]; + + if (contig_fn == nullptr) { + throw std::runtime_error( + "Contiguous implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + auto comp_ev = contig_fn(q, src_nelems, src_data, dst_data, depends); + sycl::event ht_ev = + dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + // simplify iteration space + // if 1d with strides 1 - input is contig + // dispatch to strided + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + simplify_iteration_space(nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + // Special case of contiguous data + auto contig_fn = contig_dispatch_vector[src_typeid]; + + if (contig_fn == nullptr) { + throw std::runtime_error( + "Contiguous implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + int src_elem_size = src.get_elemsize(); + int dst_elem_size = dst.get_elemsize(); + auto comp_ev = + contig_fn(q, src_nelems, src_data + src_elem_size * src_offset, + dst_data + dst_elem_size * dst_offset, depends); + + sycl::event ht_ev = + dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + + // Strided implementation + auto strided_fn = strided_dispatch_vector[src_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Strided implementation is missing for src_typeid=" + + std::to_string(src_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + std::vector host_tasks{}; + host_tasks.reserve(2); + + auto ptr_size_event_triple_ = device_allocate_and_pack( + q, host_tasks, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_)); + const auto ©_shape_ev = std::get<2>(ptr_size_event_triple_); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + sycl::event strided_fn_ev = + strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset, + dst_data, dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + q, {strided_fn_ev}, shape_strides_owner); + + host_tasks.push_back(tmp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(q, {src, dst}, host_tasks), + strided_fn_ev); +} + +/*! 
@brief Template implementing Python API for querying of type support by + * unary elementwise functions */ +template +py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, + const output_typesT &output_types) +{ + int tn = input_dtype.num(); // NumPy type numbers are the same as in dpctl + int src_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src_typeid = array_types.typenum_to_lookup_id(tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + using type_utils::_result_typeid; + int dst_typeid = _result_typeid(src_typeid, output_types); + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + using type_utils::_dtype_from_typenum; + + auto dst_typenum_t = static_cast(dst_typeid); + auto dt = _dtype_from_typenum(dst_typenum_t); + + return py::cast(dt); + } +} + +// ======================== Binary functions =========================== + +namespace +{ +template +bool isEqual(Container const &c, std::initializer_list const &l) +{ + return std::equal(std::begin(c), std::end(c), std::begin(l), std::end(l)); +} +} // namespace + +/*! @brief Template implementing Python API for binary elementwise + * functions */ +template +std::pair py_binary_ufunc( + const dpctl::tensor::usm_ndarray &src1, + const dpctl::tensor::usm_ndarray &src2, + const dpctl::tensor::usm_ndarray &dst, // dst = op(src1, src2), elementwise + sycl::queue &exec_q, + const std::vector depends, + // + const output_typesT &output_type_table, + const contig_dispatchT &contig_dispatch_table, + const strided_dispatchT &strided_dispatch_table, + const contig_matrix_row_dispatchT + &contig_matrix_row_broadcast_dispatch_table, + const contig_row_matrix_dispatchT + &contig_row_matrix_broadcast_dispatch_table) +{ + // check type_nums + int src1_typenum = src1.get_typenum(); + int src2_typenum = src2.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum); + int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int output_typeid = output_type_table[src1_typeid][src2_typeid]; + + if (output_typeid != dst_typeid) { + throw py::value_error( + "Destination array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check shapes, broadcasting is assumed done by caller + // check that dimensions are the same + int dst_nd = dst.get_ndim(); + if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *src1_shape = src1.get_shape_raw(); + const py::ssize_t *src2_shape = src2.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; i < dst_nd; ++i) { + src_nelems *= static_cast(src1_shape[i]); + shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] && + src2_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (src_nelems == 0) { + return 
std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) || + (overlap(src2, dst) && !same_logical_tensors(src2, dst))) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + // check memory overlap + const char *src1_data = src1.get_data(); + const char *src2_data = src2.get_data(); + char *dst_data = dst.get_data(); + + // handle contiguous inputs + bool is_src1_c_contig = src1.is_c_contiguous(); + bool is_src1_f_contig = src1.is_f_contiguous(); + + bool is_src2_c_contig = src2.is_c_contiguous(); + bool is_src2_f_contig = src2.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = + (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig); + bool all_f_contig = + (is_src1_f_contig && is_src2_f_contig && is_dst_f_contig); + + // dispatch for contiguous inputs + if (all_c_contig || all_f_contig) { + auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, 0, + src2_data, 0, dst_data, 0, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + + // simplify strides + auto const &src1_strides = src1.get_strides_vector(); + auto const &src2_strides = src2.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src1_strides; + shT simplified_src2_strides; + shT simplified_dst_strides; + py::ssize_t src1_offset(0); + py::ssize_t src2_offset(0); + py::ssize_t dst_offset(0); + + int nd = dst_nd; + const py::ssize_t *shape = src1_shape; + + simplify_iteration_space_3( + nd, shape, src1_strides, src2_strides, dst_strides, + // outputs + simplified_shape, simplified_src1_strides, simplified_src2_strides, + simplified_dst_strides, src1_offset, src2_offset, dst_offset); + + std::vector host_tasks{}; + if (nd < 3) { + static constexpr auto unit_stride = + std::initializer_list{1}; + + if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) && + isEqual(simplified_src2_strides, unit_stride) && + isEqual(simplified_dst_strides, unit_stride)) { + auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, + src1_offset, src2_data, src2_offset, + dst_data, dst_offset, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + if (nd == 2) { + static constexpr auto zero_one_strides = + std::initializer_list{0, 1}; + static constexpr auto one_zero_strides = + std::initializer_list{1, 0}; + static constexpr py::ssize_t one{1}; + // special case of C-contiguous matrix and a row + if (isEqual(simplified_src2_strides, zero_one_strides) && + isEqual(simplified_src1_strides, {simplified_shape[1], one}) && + isEqual(simplified_dst_strides, {simplified_shape[1], one})) { + auto matrix_row_broadcast_fn = + contig_matrix_row_broadcast_dispatch_table[src1_typeid] + [src2_typeid]; + if (matrix_row_broadcast_fn != nullptr) { + int src1_itemsize = 
src1.get_elemsize(); + int src2_itemsize = src2.get_elemsize(); + int dst_itemsize = dst.get_elemsize(); + + if (is_aligned( + src1_data + src1_offset * src1_itemsize) && + is_aligned( + src2_data + src2_offset * src2_itemsize) && + is_aligned( + dst_data + dst_offset * dst_itemsize)) { + std::size_t n0 = simplified_shape[0]; + std::size_t n1 = simplified_shape[1]; + sycl::event comp_ev = matrix_row_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, host_tasks), + comp_ev); + } + } + } + if (isEqual(simplified_src1_strides, one_zero_strides) && + isEqual(simplified_src2_strides, {one, simplified_shape[0]}) && + isEqual(simplified_dst_strides, {one, simplified_shape[0]})) { + auto row_matrix_broadcast_fn = + contig_row_matrix_broadcast_dispatch_table[src1_typeid] + [src2_typeid]; + if (row_matrix_broadcast_fn != nullptr) { + + int src1_itemsize = src1.get_elemsize(); + int src2_itemsize = src2.get_elemsize(); + int dst_itemsize = dst.get_elemsize(); + + if (is_aligned( + src1_data + src1_offset * src1_itemsize) && + is_aligned( + src2_data + src2_offset * src2_itemsize) && + is_aligned( + dst_data + dst_offset * dst_itemsize)) { + std::size_t n0 = simplified_shape[1]; + std::size_t n1 = simplified_shape[0]; + sycl::event comp_ev = row_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive( + exec_q, {src1, src2, dst}, host_tasks), + comp_ev); + } + } + } + } + } + + // dispatch to strided code + auto strided_fn = strided_dispatch_table[src1_typeid][src2_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Strided implementation is missing for src1_typeid=" + + std::to_string(src1_typeid) + + " and src2_typeid=" + std::to_string(src2_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_sz_event_triple_ = device_allocate_and_pack( + exec_q, host_tasks, simplified_shape, simplified_src1_strides, + simplified_src2_strides, simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto ©_shape_ev = std::get<2>(ptr_sz_event_triple_); + + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + sycl::event strided_fn_ev = strided_fn( + exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset, + src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {strided_fn_ev}, shape_strides_owner); + host_tasks.push_back(tmp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, host_tasks), + strided_fn_ev); +} + +/*! 
@brief Type querying for binary elementwise functions */ +template +py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, + const py::dtype &input2_dtype, + const output_typesT &output_types_table) +{ + int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl + int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl + int src1_typeid = -1; + int src2_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src1_typeid = array_types.typenum_to_lookup_id(tn1); + src2_typeid = array_types.typenum_to_lookup_id(tn2); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 || + src2_typeid >= td_ns::num_types) { + throw std::runtime_error("binary output type lookup failed"); + } + int dst_typeid = output_types_table[src1_typeid][src2_typeid]; + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + using type_utils::_dtype_from_typenum; + + auto dst_typenum_t = static_cast(dst_typeid); + auto dt = _dtype_from_typenum(dst_typenum_t); + + return py::cast(dt); + } +} + +// ==================== Inplace binary functions ======================= + +template +std::pair + py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector depends, + // + const output_typesT &output_type_table, + const contig_dispatchT &contig_dispatch_table, + const strided_dispatchT &strided_dispatch_table, + const contig_row_matrix_dispatchT + &contig_row_matrix_broadcast_dispatch_table) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(lhs); + + // check type_nums + int rhs_typenum = rhs.get_typenum(); + int lhs_typenum = lhs.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int lhs_typeid = array_types.typenum_to_lookup_id(lhs_typenum); + + int output_typeid = output_type_table[rhs_typeid][lhs_typeid]; + + if (output_typeid != lhs_typeid) { + throw py::value_error( + "Left-hand side array has unexpected elemental data type."); + } + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {rhs, lhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check shapes, broadcasting is assumed done by caller + // check that dimensions are the same + int lhs_nd = lhs.get_ndim(); + if (lhs_nd != rhs.get_ndim()) { + throw py::value_error("Array dimensions are not the same."); + } + + // check that shapes are the same + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + const py::ssize_t *lhs_shape = lhs.get_shape_raw(); + bool shapes_equal(true); + std::size_t rhs_nelems(1); + + for (int i = 0; i < lhs_nd; ++i) { + rhs_nelems *= static_cast(rhs_shape[i]); + shapes_equal = shapes_equal && (rhs_shape[i] == lhs_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + // if nelems is zero, return + if (rhs_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(lhs, rhs_nelems); + + // check memory overlap + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(rhs, lhs) && !same_logical_tensors(rhs, lhs)) { + throw 
py::value_error("Arrays index overlapping segments of memory"); + } + // check memory overlap + const char *rhs_data = rhs.get_data(); + char *lhs_data = lhs.get_data(); + + // handle contiguous inputs + bool is_rhs_c_contig = rhs.is_c_contiguous(); + bool is_rhs_f_contig = rhs.is_f_contiguous(); + + bool is_lhs_c_contig = lhs.is_c_contiguous(); + bool is_lhs_f_contig = lhs.is_f_contiguous(); + + bool both_c_contig = (is_rhs_c_contig && is_lhs_c_contig); + bool both_f_contig = (is_rhs_f_contig && is_lhs_f_contig); + + // dispatch for contiguous inputs + if (both_c_contig || both_f_contig) { + auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = contig_fn(exec_q, rhs_nelems, rhs_data, 0, lhs_data, + 0, depends); + sycl::event ht_ev = + dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + + // simplify strides + auto const &rhs_strides = rhs.get_strides_vector(); + auto const &lhs_strides = lhs.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_rhs_strides; + shT simplified_lhs_strides; + py::ssize_t rhs_offset(0); + py::ssize_t lhs_offset(0); + + int nd = lhs_nd; + const py::ssize_t *shape = rhs_shape; + + simplify_iteration_space(nd, shape, rhs_strides, lhs_strides, + // outputs + simplified_shape, simplified_rhs_strides, + simplified_lhs_strides, rhs_offset, lhs_offset); + + std::vector host_tasks{}; + if (nd < 3) { + static constexpr auto unit_stride = + std::initializer_list{1}; + + if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) && + isEqual(simplified_lhs_strides, unit_stride)) { + auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid]; + + if (contig_fn != nullptr) { + auto comp_ev = + contig_fn(exec_q, rhs_nelems, rhs_data, rhs_offset, + lhs_data, lhs_offset, depends); + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {rhs, lhs}, {comp_ev}); + + return std::make_pair(ht_ev, comp_ev); + } + } + if (nd == 2) { + static constexpr auto one_zero_strides = + std::initializer_list{1, 0}; + static constexpr py::ssize_t one{1}; + // special case of C-contiguous matrix and a row + if (isEqual(simplified_rhs_strides, one_zero_strides) && + isEqual(simplified_lhs_strides, {one, simplified_shape[0]})) { + auto row_matrix_broadcast_fn = + contig_row_matrix_broadcast_dispatch_table[rhs_typeid] + [lhs_typeid]; + if (row_matrix_broadcast_fn != nullptr) { + std::size_t n0 = simplified_shape[1]; + std::size_t n1 = simplified_shape[0]; + sycl::event comp_ev = row_matrix_broadcast_fn( + exec_q, host_tasks, n0, n1, rhs_data, rhs_offset, + lhs_data, lhs_offset, depends); + + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {lhs, rhs}, host_tasks), + comp_ev); + } + } + } + } + + // dispatch to strided code + auto strided_fn = strided_dispatch_table[rhs_typeid][lhs_typeid]; + + if (strided_fn == nullptr) { + throw std::runtime_error( + "Strided implementation is missing for rhs_typeid=" + + std::to_string(rhs_typeid) + + " and lhs_typeid=" + std::to_string(lhs_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_sz_event_triple_ = device_allocate_and_pack( + exec_q, host_tasks, simplified_shape, simplified_rhs_strides, + simplified_lhs_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_)); + auto copy_shape_ev = std::get<2>(ptr_sz_event_triple_); + + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + sycl::event 
+ sycl::event strided_fn_ev =
+ strided_fn(exec_q, rhs_nelems, nd, shape_strides, rhs_data, rhs_offset,
+ lhs_data, lhs_offset, depends, {copy_shape_ev});
+
+ // async free of shape_strides temporary
+ sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+ exec_q, {strided_fn_ev}, shape_strides_owner);
+
+ host_tasks.push_back(tmp_cleanup_ev);
+
+ return std::make_pair(
+ dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, host_tasks),
+ strided_fn_ev);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
new file mode 100644
index 000000000000..7d327ada7349
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
@@ -0,0 +1,96 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions for looking up supported types in elementwise
+/// functions.
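+///
+/// Illustrative composition of these helpers (a sketch, not part of the
+/// build): a result typeid is looked up in a per-function mapping table and
+/// converted to a NumPy dtype only when the lookup succeeds:
+///
+///   int out_id = _result_typeid(src_typeid, fn_output_id);
+///   py::object res = (out_id < 0)
+///       ? py::object(py::none())
+///       : py::cast(_dtype_from_typenum(
+///             static_cast<td_ns::typenum_t>(out_id)));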
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "elementwise_functions_type_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal::type_utils
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t)
+{
+ switch (dst_typenum_t) {
+ case td_ns::typenum_t::BOOL:
+ return py::dtype("?");
+ case td_ns::typenum_t::INT8:
+ return py::dtype("i1");
+ case td_ns::typenum_t::UINT8:
+ return py::dtype("u1");
+ case td_ns::typenum_t::INT16:
+ return py::dtype("i2");
+ case td_ns::typenum_t::UINT16:
+ return py::dtype("u2");
+ case td_ns::typenum_t::INT32:
+ return py::dtype("i4");
+ case td_ns::typenum_t::UINT32:
+ return py::dtype("u4");
+ case td_ns::typenum_t::INT64:
+ return py::dtype("i8");
+ case td_ns::typenum_t::UINT64:
+ return py::dtype("u8");
+ case td_ns::typenum_t::HALF:
+ return py::dtype("f2");
+ case td_ns::typenum_t::FLOAT:
+ return py::dtype("f4");
+ case td_ns::typenum_t::DOUBLE:
+ return py::dtype("f8");
+ case td_ns::typenum_t::CFLOAT:
+ return py::dtype("c8");
+ case td_ns::typenum_t::CDOUBLE:
+ return py::dtype("c16");
+ default:
+ throw py::value_error("Unrecognized dst_typeid");
+ }
+}
+
+int _result_typeid(int arg_typeid, const int *fn_output_id)
+{
+ if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) {
+ throw py::value_error("Input typeid " + std::to_string(arg_typeid) +
+ " is outside of expected bounds.");
+ }
+
+ return fn_output_id[arg_typeid];
+}
+
+} // namespace dpctl::tensor::py_internal::type_utils
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
new file mode 100644
index 000000000000..d3324feb3470
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
@@ -0,0 +1,56 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions for looking up supported types in elementwise
+/// functions.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal::type_utils
+{
+
+/*! @brief Produce dtype from a type number */
+extern py::dtype _dtype_from_typenum(td_ns::typenum_t);
+
+/*! @brief Lookup typeid of the result from typeid of
+ * argument and the mapping table */
+extern int _result_typeid(int, const int *);
+
+} // namespace dpctl::tensor::py_internal::type_utils
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp
new file mode 100644
index 000000000000..863501bea367
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/equal.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B09: ===== EQUAL (x1, x2)
+namespace impl
+{
+namespace equal_fn_ns = dpctl::tensor::kernels::equal;
+
+static binary_contig_impl_fn_ptr_t
+ equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+ equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_equal_dispatch_tables(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = equal_fn_ns;
+
+ // which input types are supported, and what is the type of the result
+ using fn_ns::EqualTypeMapFactory;
+ DispatchTableBuilder<int, EqualTypeMapFactory, num_types> dtb1;
+ dtb1.populate_dispatch_table(equal_output_id_table);
+
+ // function pointers for operation on general strided arrays
+ using fn_ns::EqualStridedFactory;
+ DispatchTableBuilder<binary_strided_impl_fn_ptr_t, EqualStridedFactory,
+ num_types>
+ dtb2;
+ dtb2.populate_dispatch_table(equal_strided_dispatch_table);
+
+ // function pointers for operation on contiguous inputs and output
+ using fn_ns::EqualContigFactory;
+ DispatchTableBuilder<binary_contig_impl_fn_ptr_t, EqualContigFactory,
+ num_types>
+ dtb3;
+ dtb3.populate_dispatch_table(equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_equal(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_equal_dispatch_tables();
+ using impl::equal_contig_dispatch_table;
+ using impl::equal_output_id_table;
+ using impl::equal_strided_dispatch_table;
+
+ auto equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+ const arrayT &dst, sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_binary_ufunc(
+ src1, src2, dst, exec_q, depends, equal_output_id_table,
+ // function pointers to handle operation on contiguous arrays
+ // (pointers may be nullptr)
+ equal_contig_dispatch_table,
+ // function pointers to handle operation on strided arrays (most
+ // general case)
+ equal_strided_dispatch_table,
+ // function pointers to handle operation of c-contig matrix and
+ // c-contig row with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+ // function pointers to handle operation of c-contig row and
+ // c-contig matrix with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+ };
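+ // equal_pyapi returns a (host-task event, computational event) pair:
+ // the host-task event keeps the arrays alive until the kernel completes,
+ // while the computational event can serve as a dependency for subsequent
+ // kernels (see py_binary_inplace_ufunc in elementwise_functions.hpp for
+ // the same convention).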
+ auto equal_result_type_pyapi = [&](const py::dtype &dtype1,
+ const py::dtype &dtype2) {
+ return py_binary_ufunc_result_type(dtype1, dtype2,
+ equal_output_id_table);
+ };
+ m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"),
+ py::arg("dst"), py::arg("sycl_queue"),
+ py::arg("depends") = py::list());
+ m.def("_equal_result_type", equal_result_type_pyapi, "");
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp
new file mode 100644
index 000000000000..23f370111458
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp
new file mode 100644
index 000000000000..cd3cd65107f7
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
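+///
+/// Each operation is exposed to Python as a pair of entry points; for exp
+/// these are `_exp(src, dst, sycl_queue, depends)` and
+/// `_exp_result_type(dtype)`. An illustrative (hypothetical) Python-side
+/// use, with `tei` standing for the built extension module:
+///
+///   dt = tei._exp_result_type(x.dtype)    # None if dtype is unsupported
+///   ht_ev, ev = tei._exp(x, out, q, [])   # returns a pair of sycl events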
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "exp.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/exp.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U13: ==== EXP (x)
+namespace impl
+{
+
+namespace exp_fn_ns = dpctl::tensor::kernels::exp;
+
+static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types];
+static int exp_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ exp_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = exp_fn_ns;
+
+ using fn_ns::ExpContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ExpContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(exp_contig_dispatch_vector);
+
+ using fn_ns::ExpStridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ExpStridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(exp_strided_dispatch_vector);
+
+ using fn_ns::ExpTypeMapFactory;
+ DispatchVectorBuilder<int, ExpTypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(exp_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_exp(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_exp_dispatch_vectors();
+ using impl::exp_contig_dispatch_vector;
+ using impl::exp_output_typeid_vector;
+ using impl::exp_strided_dispatch_vector;
+
+ auto exp_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, exp_output_typeid_vector,
+ exp_contig_dispatch_vector, exp_strided_dispatch_vector);
+ };
+ m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto exp_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector);
+ };
+ m.def("_exp_result_type", exp_result_type_pyapi);
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp
new file mode 100644
index 000000000000..14b757a18e92
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_exp(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp
new file mode 100644
index 000000000000..fc40a8e0aab9
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "exp2.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/exp2.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U38: ==== EXP2 (x)
+namespace impl
+{
+
+namespace exp2_fn_ns = dpctl::tensor::kernels::exp2;
+
+static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types];
+static int exp2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ exp2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp2_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = exp2_fn_ns;
+
+ using fn_ns::Exp2ContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
+
+ using fn_ns::Exp2StridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
+
+ using fn_ns::Exp2TypeMapFactory;
+ DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_exp2(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_exp2_dispatch_vectors();
+ using impl::exp2_contig_dispatch_vector;
+ using impl::exp2_output_typeid_vector;
+ using impl::exp2_strided_dispatch_vector;
+
+ auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, exp2_output_typeid_vector,
+ exp2_contig_dispatch_vector, exp2_strided_dispatch_vector);
+ };
+ m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto exp2_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector);
+ };
+ m.def("_exp2_result_type", exp2_result_type_pyapi);
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp
new file mode 100644
index 000000000000..f9f315d14383
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/exp2.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_exp2(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp
new file mode 100644
index 000000000000..b4770b7b819c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "expm1.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/expm1.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U14: ==== EXPM1 (x)
+namespace impl
+{
+
+namespace expm1_fn_ns = dpctl::tensor::kernels::expm1;
+
+static unary_contig_impl_fn_ptr_t
+ expm1_contig_dispatch_vector[td_ns::num_types];
+static int expm1_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ expm1_strided_dispatch_vector[td_ns::num_types];
+
+void populate_expm1_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = expm1_fn_ns;
+
+ using fn_ns::Expm1ContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Expm1ContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector);
+
+ using fn_ns::Expm1StridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Expm1StridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector);
+
+ using fn_ns::Expm1TypeMapFactory;
+ DispatchVectorBuilder<int, Expm1TypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(expm1_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_expm1(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_expm1_dispatch_vectors();
+ using impl::expm1_contig_dispatch_vector;
+ using impl::expm1_output_typeid_vector;
+ using impl::expm1_strided_dispatch_vector;
+
+ auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, expm1_output_typeid_vector,
+ expm1_contig_dispatch_vector, expm1_strided_dispatch_vector);
+ };
+ m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto expm1_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype,
+ expm1_output_typeid_vector);
+ };
+ m.def("_expm1_result_type", expm1_result_type_pyapi);
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp
new file mode 100644
index
000000000000..4f373fe67dff
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/expm1.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_expm1(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp
new file mode 100644
index 000000000000..2a81ce6552a9
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "floor.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/floor.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U15: ==== FLOOR (x)
+namespace impl
+{
+
+namespace floor_fn_ns = dpctl::tensor::kernels::floor;
+
+static unary_contig_impl_fn_ptr_t
+ floor_contig_dispatch_vector[td_ns::num_types];
+static int floor_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+ floor_strided_dispatch_vector[td_ns::num_types];
+
+void populate_floor_dispatch_vectors(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = floor_fn_ns;
+
+ using fn_ns::FloorContigFactory;
+ DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
+ num_types>
+ dvb1;
+ dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
+
+ using fn_ns::FloorStridedFactory;
+ DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
+ num_types>
+ dvb2;
+ dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
+
+ using fn_ns::FloorTypeMapFactory;
+ DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
+ dvb3.populate_dispatch_vector(floor_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_floor(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_floor_dispatch_vectors();
+ using impl::floor_contig_dispatch_vector;
+ using impl::floor_output_typeid_vector;
+ using impl::floor_strided_dispatch_vector;
+
+ auto floor_pyapi = [&](const arrayT &src, const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_unary_ufunc(
+ src, dst, exec_q, depends, floor_output_typeid_vector,
+ floor_contig_dispatch_vector, floor_strided_dispatch_vector);
+ };
+ m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"),
+ py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+ auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
+ return py_unary_ufunc_result_type(dtype,
+ floor_output_typeid_vector);
+ };
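+ // The query below mirrors the kernel dispatch: it reports the dtype the
+ // _floor kernel would produce for a given input dtype, or None when no
+ // kernel is registered for that dtype.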
m.def("_floor_result_type", floor_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp new file mode 100644 index 000000000000..5e5fe41ce313 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_floor(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp new file mode 100644 index 000000000000..af4635a0f500 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.cpp @@ -0,0 +1,205 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "floor_divide.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/floor_divide.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B10: ===== FLOOR_DIVIDE (x1, x2)
+namespace impl
+{
+namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
+
+static binary_contig_impl_fn_ptr_t
+ floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+static int floor_divide_inplace_output_id_table[td_ns::num_types]
+ [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+ floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+ floor_divide_inplace_contig_dispatch_table[td_ns::num_types]
+ [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+ floor_divide_inplace_strided_dispatch_table[td_ns::num_types]
+ [td_ns::num_types];
+
+void populate_floor_divide_dispatch_tables(void)
+{
+ using namespace td_ns;
+ namespace fn_ns = floor_divide_fn_ns;
+
+ // which input types are supported, and what is the type of the result
+ using fn_ns::FloorDivideTypeMapFactory;
+ DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
+ dtb1.populate_dispatch_table(floor_divide_output_id_table);
+
+ // function pointers for operation on general strided arrays
+ using fn_ns::FloorDivideStridedFactory;
+ DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+ FloorDivideStridedFactory, num_types>
+ dtb2;
+ dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
+
+ // function pointers for operation on contiguous inputs and output
+ using fn_ns::FloorDivideContigFactory;
+ DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
+ num_types>
+ dtb3;
+ dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
+
+ // function pointers for inplace operation on general strided arrays
+ using fn_ns::FloorDivideInplaceStridedFactory;
+ DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+ FloorDivideInplaceStridedFactory, num_types>
+ dtb4;
+ dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table);
+
+ // function pointers for inplace operation on contiguous inputs and output
+ using fn_ns::FloorDivideInplaceContigFactory;
+ DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+ FloorDivideInplaceContigFactory, num_types>
+ dtb5;
+ dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table);
+
+ // which types are supported by the in-place kernels
+ using fn_ns::FloorDivideInplaceTypeMapFactory;
+ DispatchTableBuilder<int, FloorDivideInplaceTypeMapFactory, num_types>
+ dtb6;
+ dtb6.populate_dispatch_table(floor_divide_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_floor_divide(py::module_ m)
+{
+ using arrayT = dpctl::tensor::usm_ndarray;
+ using event_vecT = std::vector<sycl::event>;
+ {
+ impl::populate_floor_divide_dispatch_tables();
+ using impl::floor_divide_contig_dispatch_table;
+ using impl::floor_divide_output_id_table;
+ using impl::floor_divide_strided_dispatch_table;
+
+ auto floor_divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+ const arrayT &dst, sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_binary_ufunc(
+ src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
+ // function pointers to handle operation on contiguous arrays
+ // (pointers may be nullptr)
+ floor_divide_contig_dispatch_table,
+ // function pointers to handle operation on strided arrays (most
+ // general case)
+ floor_divide_strided_dispatch_table,
+ // function pointers to handle operation of c-contig matrix and
+ // c-contig row with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+ // function pointers to handle operation of c-contig row and
+ // c-contig matrix with broadcasting (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+ };
+ auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1,
+ const py::dtype &dtype2) {
+ return py_binary_ufunc_result_type(dtype1, dtype2,
+ floor_divide_output_id_table);
+ };
+ m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
+ py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+ py::arg("depends") = py::list());
+ m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
+
+ using impl::floor_divide_inplace_contig_dispatch_table;
+ using impl::floor_divide_inplace_output_id_table;
+ using impl::floor_divide_inplace_strided_dispatch_table;
+
+ auto floor_divide_inplace_pyapi = [&](const arrayT &src,
+ const arrayT &dst,
+ sycl::queue &exec_q,
+ const event_vecT &depends = {}) {
+ return py_binary_inplace_ufunc(
+ src, dst, exec_q, depends, floor_divide_inplace_output_id_table,
+ // function pointers to handle inplace operation on
+ // contiguous arrays (pointers may be nullptr)
+ floor_divide_inplace_contig_dispatch_table,
+ // function pointers to handle inplace operation on strided
+ // arrays (most general case)
+ floor_divide_inplace_strided_dispatch_table,
+ // function pointers to handle inplace operation on
+ // c-contig matrix with c-contig row with broadcasting
+ // (may be nullptr)
+ td_ns::NullPtrTable<
+ binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+ };
+ m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "",
+ py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+ py::arg("depends") = py::list());
+ }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
new file mode 100644
index 000000000000..17d493b58057
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_floor_divide(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
new file mode 100644
index 000000000000..f3cfaeae2286
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
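+///
+/// Like the other binary comparisons in this directory, greater specializes
+/// only the generic contiguous and strided code paths and passes empty
+/// NullPtrTable entries for the matrix/row broadcast slots of
+/// py_binary_ufunc. The `_greater_result_type(dtype1, dtype2)` query returns
+/// the result dtype (boolean for supported pairs) or None otherwise.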
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
new file mode 100644
index 000000000000..f3cfaeae2286
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "greater.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B11: ===== GREATER (x1, x2)
+namespace impl
+{
+namespace greater_fn_ns = dpctl::tensor::kernels::greater;
+
+static binary_contig_impl_fn_ptr_t
+    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterTypeMapFactory;
+    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_dispatch_tables();
+        using impl::greater_contig_dispatch_table;
+        using impl::greater_output_id_table;
+        using impl::greater_strided_dispatch_table;
+
+        auto greater_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                greater_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_output_id_table);
+        };
+        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_result_type", greater_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp
new file mode 100644
index 000000000000..c8c3caa5f1fd
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_greater(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
new file mode 100644
index 000000000000..ad9af91ce3d8
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "greater_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/greater_equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B12: ===== GREATER_EQUAL (x1, x2)
+namespace impl
+{
+namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
+
+static binary_contig_impl_fn_ptr_t
+    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_greater_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = greater_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::GreaterEqualTypeMapFactory;
+    DispatchTableBuilder<int, GreaterEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(greater_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::GreaterEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         GreaterEqualStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::GreaterEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         GreaterEqualContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_greater_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_greater_equal_dispatch_tables();
+        using impl::greater_equal_contig_dispatch_table;
+        using impl::greater_equal_output_id_table;
+        using impl::greater_equal_strided_dispatch_table;
+
+        auto greater_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                       const arrayT &dst, sycl::queue &exec_q,
+                                       const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                greater_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                greater_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                   const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               greater_equal_output_id_table);
+        };
+        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi,
+              "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
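Note: each binding above hands py_binary_ufunc both a contiguous and a strided dispatch table. A simplified sketch of the selection it is assumed to perform is below; the actual routine also validates type support, handles broadcasting, and tries the matrix/row fast paths before falling back. Names here are illustrative only.

```cpp
#include <iostream>

using fn = const char *(*)();

static const char *contig_kernel() { return "contig kernel"; }
static const char *strided_kernel() { return "strided kernel"; }

// Prefer the contiguous kernel when inputs and destination are all
// C-contiguous and a kernel is registered; otherwise use the strided one.
static const char *select(bool all_c_contig, fn contig_fn, fn strided_fn)
{
    if (all_c_contig && contig_fn != nullptr) {
        return contig_fn(); // single linear-index kernel, no stride math
    }
    return strided_fn(); // most general case, always populated
}

int main()
{
    std::cout << select(true, contig_kernel, strided_kernel) << "\n";
    std::cout << select(false, contig_kernel, strided_kernel) << "\n";
}
```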
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
new file mode 100644
index 000000000000..0cf7f8e89bbf
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_greater_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
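Note: the td_ns::NullPtrTable<...>{} arguments passed throughout these bindings stand for "no specialized kernel registered for this shape pattern"; the caller then takes the general strided path. A toy model of what a null-initialized table conveys (names are illustrative, not dpctl's):

```cpp
#include <cstddef>

using fn_ptr_t = void (*)();

// Value-initialized member array: every entry starts out as nullptr,
// which is exactly the signal a NullPtrTable sends to py_binary_ufunc.
template <typename FnT, std::size_t N = 3> struct ToyNullPtrTable
{
    FnT table[N][N] = {};
};

template <typename FnT, std::size_t N>
bool has_specialization(const FnT (&tbl)[N][N], std::size_t i, std::size_t j)
{
    return tbl[i][j] != nullptr; // caller falls back when false
}

int main()
{
    ToyNullPtrTable<fn_ptr_t> t{};
    return has_specialization(t.table, 0, 0) ? 1 : 0; // returns 0: fall back
}
```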
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp
new file mode 100644
index 000000000000..f4ce161f4cda
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "hypot.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/hypot.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B24: ===== HYPOT (x1, x2)
+namespace impl
+{
+namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
+
+static binary_contig_impl_fn_ptr_t
+    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_hypot_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = hypot_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::HypotTypeMapFactory;
+    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(hypot_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::HypotStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::HypotContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_hypot(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_hypot_dispatch_tables();
+        using impl::hypot_contig_dispatch_table;
+        using impl::hypot_output_id_table;
+        using impl::hypot_strided_dispatch_table;
+
+        auto hypot_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, hypot_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                hypot_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                hypot_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto hypot_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               hypot_output_id_table);
+        };
+        m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_hypot_result_type", hypot_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp
new file mode 100644
index 000000000000..5bc73e717ad3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/hypot.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_hypot(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp
new file mode 100644
index 000000000000..833295d22891
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/imag.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "imag.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/imag.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U16: ==== IMAG (x)
+namespace impl
+{
+
+namespace imag_fn_ns = dpctl::tensor::kernels::imag;
+
+static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types];
+static int imag_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    imag_strided_dispatch_vector[td_ns::num_types];
+
+void populate_imag_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = imag_fn_ns;
+
+    using fn_ns::ImagContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ImagContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(imag_contig_dispatch_vector);
+
+    using fn_ns::ImagStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ImagStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(imag_strided_dispatch_vector);
+
+    using fn_ns::ImagTypeMapFactory;
+    DispatchVectorBuilder<int, ImagTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(imag_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_imag(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_imag_dispatch_vectors();
+        using impl::imag_contig_dispatch_vector;
+        using impl::imag_output_typeid_vector;
+        using impl::imag_strided_dispatch_vector;
+
+        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, imag_output_typeid_vector,
+                imag_contig_dispatch_vector, imag_strided_dispatch_vector);
+        };
+        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
+        };
+        m.def("_imag_result_type", imag_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp
new file mode 100644
index 000000000000..7cc285855328
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/imag.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_imag(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp
new file mode 100644
index 000000000000..1882406b37f3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "isfinite.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isfinite.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U17: ==== ISFINITE (x)
+namespace impl
+{
+
+namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
+
+static unary_contig_impl_fn_ptr_t
+    isfinite_contig_dispatch_vector[td_ns::num_types];
+static int isfinite_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isfinite_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isfinite_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isfinite_fn_ns;
+
+    using fn_ns::IsFiniteContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
+
+    using fn_ns::IsFiniteStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
+
+    using fn_ns::IsFiniteTypeMapFactory;
+    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isfinite(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isfinite_dispatch_vectors();
+        using impl::isfinite_contig_dispatch_vector;
+        using impl::isfinite_output_typeid_vector;
+        using impl::isfinite_strided_dispatch_vector;
+
+        auto isfinite_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  isfinite_output_typeid_vector,
+                                  isfinite_contig_dispatch_vector,
+                                  isfinite_strided_dispatch_vector);
+        };
+        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isfinite_output_typeid_vector);
+        };
+        m.def("_isfinite_result_type", isfinite_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
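Note: the unary functions in this hunk (imag, isfinite, isinf, isnan) use DispatchVectorBuilder, indexed by a single input type id, instead of the num_types x num_types tables the binary functions build. A scaled-down model under the same toy-naming assumption as the earlier sketch:

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>

constexpr std::size_t num_types = 2;

using unary_fn = double (*)(double);

// Factory templated on a single input type id, mirroring the per-dtype
// ContigFactory/StridedFactory instantiations above.
template <std::size_t T> struct ToyAbsFactory
{
    unary_fn get() { return [](double x) { return std::fabs(x); }; }
};

static unary_fn toy_vector[num_types];

template <std::size_t T = 0> void fill_vector()
{
    toy_vector[T] = ToyAbsFactory<T>{}.get();
    if constexpr (T + 1 < num_types)
        fill_vector<T + 1>();
}

int main()
{
    fill_vector();
    std::cout << toy_vector[0](-2.5) << "\n"; // prints 2.5
}
```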
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp
new file mode 100644
index 000000000000..31691916c1f8
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/isfinite.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isfinite(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
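Note: the _*_result_type helpers registered in these files only consult the output typeid vector (or table) built at init time. A simplified sketch of that lookup; the real py_unary_ufunc_result_type first maps a py::dtype to a type id, and -1 below is only assumed to play the role of the "unsupported" marker:

```cpp
#include <iostream>

constexpr int num_types = 4;
// For isinf/isnan/isfinite every supported input maps to bool's type id;
// bool_typeid = 0 is an illustrative value, not dpctl's actual id.
constexpr int bool_typeid = 0;

static int output_typeid_vector[num_types] = {bool_typeid, bool_typeid,
                                              bool_typeid, -1};

static int result_typeid(int src_typeid)
{
    return output_typeid_vector[src_typeid]; // -1: no kernel for this dtype
}

int main()
{
    std::cout << result_typeid(1) << " " << result_typeid(3) << "\n"; // 0 -1
}
```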
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "isinf.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/isinf.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U18: ==== ISINF (x) +namespace impl +{ + +namespace isinf_fn_ns = dpctl::tensor::kernels::isinf; + +static unary_contig_impl_fn_ptr_t + isinf_contig_dispatch_vector[td_ns::num_types]; +static int isinf_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isinf_strided_dispatch_vector[td_ns::num_types]; + +void populate_isinf_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isinf_fn_ns; + + using fn_ns::IsInfContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector); + + using fn_ns::IsInfStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector); + + using fn_ns::IsInfTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isinf_output_typeid_vector); +}; + +} // namespace impl + +void init_isinf(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_isinf_dispatch_vectors(); + using impl::isinf_contig_dispatch_vector; + using impl::isinf_output_typeid_vector; + using impl::isinf_strided_dispatch_vector; + + auto isinf_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isinf_output_typeid_vector, + isinf_contig_dispatch_vector, isinf_strided_dispatch_vector); + }; + m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto isinf_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + isinf_output_typeid_vector); + }; + m.def("_isinf_result_type", isinf_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp new file mode 100644 index 000000000000..3dec9f20c791 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/isinf.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_isinf(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp new file mode 100644 index 000000000000..ce832d0a0ed3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "isnan.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/isnan.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U19: ==== ISNAN (x) +namespace impl +{ + +namespace isnan_fn_ns = dpctl::tensor::kernels::isnan; + +static unary_contig_impl_fn_ptr_t + isnan_contig_dispatch_vector[td_ns::num_types]; +static int isnan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isnan_strided_dispatch_vector[td_ns::num_types]; + +void populate_isnan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isnan_fn_ns; + + using fn_ns::IsNanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector); + + using fn_ns::IsNanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector); + + using fn_ns::IsNanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isnan_output_typeid_vector); +}; + +} // namespace impl + +void init_isnan(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_isnan_dispatch_vectors(); + using impl::isnan_contig_dispatch_vector; + using impl::isnan_output_typeid_vector; + using impl::isnan_strided_dispatch_vector; + + auto isnan_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isnan_output_typeid_vector, + isnan_contig_dispatch_vector, isnan_strided_dispatch_vector); + }; + m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto isnan_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + isnan_output_typeid_vector); + }; + m.def("_isnan_result_type", isnan_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp new file mode 100644 index 000000000000..d5a8cdae37e8 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/isnan.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_isnan(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/less.cpp new file mode 100644 index 000000000000..d587ee713178 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less.cpp @@ -0,0 +1,145 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "less.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/less.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B13: ===== LESS (x1, x2) +namespace impl +{ +namespace less_fn_ns = dpctl::tensor::kernels::less; + +static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int less_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + less_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_less_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = less_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LessTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(less_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LessStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(less_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LessContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(less_contig_dispatch_table); +}; + +} // namespace impl + +void init_less(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_less_dispatch_tables(); + using impl::less_contig_dispatch_table; + using impl::less_output_id_table; + using impl::less_strided_dispatch_table; + + auto less_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, less_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + less_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + less_strided_dispatch_table, + // function pointers to handle 
operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto less_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + less_output_id_table); + }; + m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_less_result_type", less_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/less.hpp new file mode 100644 index 000000000000..e08d84f380da --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
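Note on the dispatch pattern used throughout these files: at call time, py_binary_ufunc reduces to two typeid-indexed table lookups, and the NullPtrTable arguments above simply supply all-nullptr tables to opt out of the matrix-row broadcast fast paths. A minimal, self-contained sketch of that lookup, with illustrative names and an assumed num_types (the real constants live in utils/type_dispatch.hpp):

#include <cstddef>

constexpr int num_types = 14; // assumption: stands in for td_ns::num_types

using binary_fn_t = void (*)(const void *, const void *, void *, std::size_t);

// result typeid for each (src1, src2) typeid pair, -1 where unsupported
static int output_id_table[num_types][num_types];
// specialized kernels; entries may be nullptr, and the broadcast fast-path
// tables passed as td_ns::NullPtrTable above are all-nullptr by construction
static binary_fn_t contig_table[num_types][num_types];

binary_fn_t lookup_binary(int src1_typeid, int src2_typeid, int &dst_typeid)
{
    dst_typeid = output_id_table[src1_typeid][src2_typeid];
    if (dst_typeid < 0) {
        return nullptr; // dtype combination rejected before any kernel launch
    }
    return contig_table[src1_typeid][src2_typeid];
}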
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_less(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp new file mode 100644 index 000000000000..433969cead27 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.cpp @@ -0,0 +1,145 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include <sycl/sycl.hpp> + +#include <vector> + +#include "dpnp4pybind11.hpp" +#include <pybind11/complex.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> + +#include "elementwise_functions.hpp" +#include "less_equal.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/less_equal.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B14: ===== LESS_EQUAL (x1, x2) +namespace impl +{ +namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal; + +static binary_contig_impl_fn_ptr_t + less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_less_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = less_equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LessEqualTypeMapFactory; + DispatchTableBuilder<int, LessEqualTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(less_equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LessEqualStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessEqualStridedFactory, + num_types> + dtb2; + dtb2.populate_dispatch_table(less_equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LessEqualContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessEqualContigFactory, + num_types> + dtb3; + dtb3.populate_dispatch_table(less_equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_less_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_less_equal_dispatch_tables(); + using impl::less_equal_contig_dispatch_table; + using impl::less_equal_output_id_table; + using impl::less_equal_strided_dispatch_table; + + auto less_equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, less_equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + less_equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + less_equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + less_equal_output_id_table); + }; + m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + 
m.def("_less_equal_result_type", less_equal_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp new file mode 100644 index 000000000000..8eeb837a35a7 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/less_equal.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_less_equal(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log.cpp new file mode 100644 index 000000000000..2906304eaffa --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U20: ==== LOG (x) +namespace impl +{ + +namespace log_fn_ns = dpctl::tensor::kernels::log; + +static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types]; +static int log_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log_strided_dispatch_vector[td_ns::num_types]; + +void populate_log_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log_fn_ns; + + using fn_ns::LogContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log_contig_dispatch_vector); + + using fn_ns::LogStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log_strided_dispatch_vector); + + using fn_ns::LogTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log_output_typeid_vector); +}; + +} // namespace impl + +void init_log(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log_dispatch_vectors(); + using impl::log_contig_dispatch_vector; + using impl::log_output_typeid_vector; + using impl::log_strided_dispatch_vector; + + auto log_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, 
log_output_typeid_vector, + log_contig_dispatch_vector, log_strided_dispatch_vector); + }; + m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, log_output_typeid_vector); + }; + m.def("_log_result_type", log_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log.hpp new file mode 100644 index 000000000000..fb065e82e037 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp new file mode 100644 index 000000000000..9501af987341 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log10.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
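The unary entries (log, log10, log1p, log2, logical_not) follow the one-dimensional analogue of the same scheme: a typeid-indexed vector rather than a table. A sketch of the lookup py_unary_ufunc performs, again with illustrative names only:

#include <cstddef>

constexpr int num_types = 14; // assumption: stands in for td_ns::num_types

using unary_fn_t = void (*)(const void *, void *, std::size_t);

static int output_typeid_vector[num_types];          // -1 where unsupported
static unary_fn_t contig_dispatch_vector[num_types]; // nullptr where unsupported

unary_fn_t lookup_unary(int src_typeid, int &dst_typeid)
{
    dst_typeid = output_typeid_vector[src_typeid];
    return (dst_typeid < 0) ? nullptr : contig_dispatch_vector[src_typeid];
}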
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log10.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log10.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U23: ==== LOG10 (x) +namespace impl +{ + +namespace log10_fn_ns = dpctl::tensor::kernels::log10; + +static unary_contig_impl_fn_ptr_t + log10_contig_dispatch_vector[td_ns::num_types]; +static int log10_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log10_strided_dispatch_vector[td_ns::num_types]; + +void populate_log10_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log10_fn_ns; + + using fn_ns::Log10ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log10_contig_dispatch_vector); + + using fn_ns::Log10StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log10_strided_dispatch_vector); + + using fn_ns::Log10TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log10_output_typeid_vector); +}; + +} // namespace impl + +void init_log10(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log10_dispatch_vectors(); + using impl::log10_contig_dispatch_vector; + using impl::log10_output_typeid_vector; + using impl::log10_strided_dispatch_vector; + + auto log10_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log10_output_typeid_vector, + log10_contig_dispatch_vector, log10_strided_dispatch_vector); + }; + m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log10_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + log10_output_typeid_vector); + }; + m.def("_log10_result_type", log10_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp new file mode 100644 index 000000000000..779b15472462 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log10.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log10(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp new file mode 100644 index 000000000000..c94b3f3b5d7d --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
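Carrying a dedicated log1p kernel alongside log is not redundancy: log(1 + x) loses essentially all significant digits when x is near zero, while log1p evaluates the same quantity accurately. A standalone check of the effect:

#include <cmath>
#include <cstdio>

int main()
{
    double x = 1e-16;
    // 1.0 + x rounds back to exactly 1.0 in double precision,
    // so the naive form collapses to log(1.0) == 0
    std::printf("log(1 + x) = %.17g\n", std::log(1.0 + x));
    std::printf("log1p(x)   = %.17g\n", std::log1p(x));
    return 0;
}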
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log1p.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log1p.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U21: ==== LOG1P (x) +namespace impl +{ + +namespace log1p_fn_ns = dpctl::tensor::kernels::log1p; + +static unary_contig_impl_fn_ptr_t + log1p_contig_dispatch_vector[td_ns::num_types]; +static int log1p_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log1p_strided_dispatch_vector[td_ns::num_types]; + +void populate_log1p_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log1p_fn_ns; + + using fn_ns::Log1pContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector); + + using fn_ns::Log1pStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector); + + using fn_ns::Log1pTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log1p_output_typeid_vector); +}; + +} // namespace impl + +void init_log1p(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log1p_dispatch_vectors(); + using impl::log1p_contig_dispatch_vector; + using impl::log1p_output_typeid_vector; + using impl::log1p_strided_dispatch_vector; + + auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log1p_output_typeid_vector, + log1p_contig_dispatch_vector, log1p_strided_dispatch_vector); + }; + m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log1p_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + log1p_output_typeid_vector); + }; + m.def("_log1p_result_type", log1p_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp new file mode 100644 index 000000000000..85bf21c8ea48 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log1p.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log1p(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp new file mode 100644 index 000000000000..825d516f7820 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/log2.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log2.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log2.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U22: ==== LOG2 (x) +namespace impl +{ + +namespace log2_fn_ns = dpctl::tensor::kernels::log2; + +static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types]; +static int log2_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log2_strided_dispatch_vector[td_ns::num_types]; + +void populate_log2_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log2_fn_ns; + + using fn_ns::Log2ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log2_contig_dispatch_vector); + + using fn_ns::Log2StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log2_strided_dispatch_vector); + + using fn_ns::Log2TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log2_output_typeid_vector); +}; + +} // namespace impl + +void init_log2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log2_dispatch_vectors(); + using impl::log2_contig_dispatch_vector; + using impl::log2_output_typeid_vector; + using impl::log2_strided_dispatch_vector; + + auto log2_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log2_output_typeid_vector, + log2_contig_dispatch_vector, log2_strided_dispatch_vector); + }; + m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log2_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector); + }; + m.def("_log2_result_type", log2_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp new file mode 100644 index 000000000000..11f757b1449d --- /dev/null +++ 
b/dpnp/tensor/libtensor/source/elementwise_functions/log2.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_log2(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp new file mode 100644 index 000000000000..71bc9cad4035 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.cpp @@ -0,0 +1,145 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
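Every m.def(...) in these files uses the same keyword-argument convention, with depends defaulting to an empty Python list that the STL caster turns into an empty event vector at call time. A self-contained sketch of that pybind11 pattern (module and function names are examples, not the real binding):

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <cstddef>
#include <vector>

namespace py = pybind11;

// stands in for a kernel launcher that waits on dependent events
std::size_t submit(const std::vector<int> &depends)
{
    return depends.size();
}

PYBIND11_MODULE(_example_impl, m)
{
    // callers may omit "depends"; it then arrives as an empty vector
    m.def("_submit", &submit, "", py::arg("depends") = py::list());
}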
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "logaddexp.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/logaddexp.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B15: ===== LOGADDEXP (x1, x2) +namespace impl +{ +namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp; + +static binary_contig_impl_fn_ptr_t + logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_logaddexp_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = logaddexp_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LogAddExpTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(logaddexp_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LogAddExpStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LogAddExpContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table); +}; + +} // namespace impl + +void init_logaddexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_logaddexp_dispatch_tables(); + using impl::logaddexp_contig_dispatch_table; + using impl::logaddexp_output_id_table; + using impl::logaddexp_strided_dispatch_table; + + auto logaddexp_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends 
= {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, logaddexp_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + logaddexp_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + logaddexp_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + logaddexp_output_id_table); + }; + m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp new file mode 100644 index 000000000000..2c4efa7d0c56 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logaddexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file
/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
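Each translation unit above only exports an init_<op>(py::module_) hook; the hooks are presumably aggregated in a single module definition elsewhere in the PR. A sketch of that aggregation, with stub bodies so it links standalone (the module name is taken from the \file comments and is an assumption here):

#include <pybind11/pybind11.h>

namespace py = pybind11;

namespace dpctl::tensor::py_internal
{
// stand-in definitions; the real ones are the init_* functions above
void init_logaddexp(py::module_) {}
void init_logical_and(py::module_) {}
} // namespace dpctl::tensor::py_internal

PYBIND11_MODULE(_tensor_elementwise_impl, m)
{
    using namespace dpctl::tensor::py_internal;
    init_logaddexp(m);
    init_logical_and(m);
    // ... one init_* call per elementwise operation
}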
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logaddexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp new file mode 100644 index 000000000000..90c0b52a6aa2 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include <sycl/sycl.hpp> + +#include <vector> + +#include "dpnp4pybind11.hpp" +#include <pybind11/complex.h> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> + +#include "elementwise_functions.hpp" +#include "logical_and.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/logical_and.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B16: ===== LOGICAL_AND (x1, x2) +namespace impl +{ +namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and; + +static binary_contig_impl_fn_ptr_t + logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_logical_and_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = logical_and_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LogicalAndTypeMapFactory; + DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1; + dtb1.populate_dispatch_table(logical_and_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LogicalAndStridedFactory; + DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalAndStridedFactory, + num_types> + dtb2; + dtb2.populate_dispatch_table(logical_and_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LogicalAndContigFactory; + DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory, + num_types> + dtb3; + dtb3.populate_dispatch_table(logical_and_contig_dispatch_table); +}; + +} // namespace impl + +void init_logical_and(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + { + impl::populate_logical_and_dispatch_tables(); + using impl::logical_and_contig_dispatch_table; + using impl::logical_and_output_id_table; + using impl::logical_and_strided_dispatch_table; + + auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, logical_and_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + logical_and_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + logical_and_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig row and + // c-contig matrix with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + logical_and_output_id_table); + }; + m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"), + py::arg("src2"), 
py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logical_and_result_type", logical_and_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp new file mode 100644 index 000000000000..c22a98f24146 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logical_and(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp new file mode 100644 index 000000000000..e8f5845fac16 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp @@ -0,0 +1,129 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
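Comparison and logical operations such as logical_and share one typemap property: every supported input pair yields a bool result. An illustrative stand-in for what the TypeMapFactory encodes (the real factory walks the td_ns typelist; the void-means-unsupported convention below is only for this sketch):

#include <type_traits>

template <typename T1, typename T2> struct LogicalAndOutputSketch
{
    // any arithmetic pair is "supported" here and maps to bool;
    // everything else maps to void, i.e. unsupported
    using value_type = std::conditional_t<std::is_arithmetic_v<T1> &&
                                              std::is_arithmetic_v<T2>,
                                          bool, void>;
};

static_assert(
    std::is_same_v<LogicalAndOutputSketch<int, float>::value_type, bool>);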
+        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp
new file mode 100644
index 000000000000..c22a98f24146
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_and.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_and(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp
new file mode 100644
index 000000000000..e8f5845fac16
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.cpp
@@ -0,0 +1,129 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_not.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_not.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U24: ==== LOGICAL_NOT (x)
+namespace impl
+{
+
+namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
+
+static unary_contig_impl_fn_ptr_t
+    logical_not_contig_dispatch_vector[td_ns::num_types];
+static int logical_not_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    logical_not_strided_dispatch_vector[td_ns::num_types];
+
+void populate_logical_not_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_not_fn_ns;
+
+    using fn_ns::LogicalNotContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
+
+    using fn_ns::LogicalNotStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogicalNotStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
+
+    using fn_ns::LogicalNotTypeMapFactory;
+    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
+};
+
+} // namespace impl
+void init_logical_not(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_not_dispatch_vectors();
+        using impl::logical_not_contig_dispatch_vector;
+        using impl::logical_not_output_typeid_vector;
+        using impl::logical_not_strided_dispatch_vector;
+
+        auto logical_not_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  logical_not_output_typeid_vector,
+                                  logical_not_contig_dispatch_vector,
+                                  logical_not_strided_dispatch_vector);
+        };
+        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              logical_not_output_typeid_vector);
+        };
+        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp
new file mode 100644
index 000000000000..f3bb79cc28cc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_not.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_not(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp
new file mode 100644
index 000000000000..38c981792345
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_or.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_or.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B17: ===== LOGICAL_OR (x1, x2)
+namespace impl
+{
+namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
+
+static binary_contig_impl_fn_ptr_t
+    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalOrTypeMapFactory;
+    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_or_dispatch_tables();
+        using impl::logical_or_contig_dispatch_table;
+        using impl::logical_or_output_id_table;
+        using impl::logical_or_strided_dispatch_table;
+
+        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                logical_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
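+        // logical_or has no specialized matrix/row broadcast kernels; the
+        // NullPtrTable arguments above leave those slots empty so the general
+        // strided path handles such cases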
+        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_or_output_id_table);
+        };
+        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp
new file mode 100644
index 000000000000..11e83fe8cedf
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_or.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_or(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
new file mode 100644
index 000000000000..759133ca6120
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_xor.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_xor.hpp"
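+// the numbered markers below (B18 for logical_xor) index the table of binary
+// and unary elementwise operations this extension implements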
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B18: ===== LOGICAL_XOR (x1, x2)
+namespace impl
+{
+namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
+
+static binary_contig_impl_fn_ptr_t
+    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalXorTypeMapFactory;
+    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_xor_dispatch_tables();
+        using impl::logical_xor_contig_dispatch_table;
+        using impl::logical_xor_output_id_table;
+        using impl::logical_xor_strided_dispatch_table;
+
+        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                logical_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_xor_output_id_table);
+        };
+        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
new file mode 100644
index 000000000000..24c163249128
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
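+// declaration only: logical_xor.cpp defines init_logical_xor and registers
+// the _logical_xor entry points with the extension module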
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_xor(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp
new file mode 100644
index 000000000000..8fda65c43dca
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "maximum.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/maximum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B26: ===== MAXIMUM (x1, x2)
+namespace impl
+{
+namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
+
+static binary_contig_impl_fn_ptr_t
+    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_maximum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = maximum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MaximumTypeMapFactory;
+    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(maximum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MaximumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MaximumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_maximum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_maximum_dispatch_tables();
+        using impl::maximum_contig_dispatch_table;
+        using impl::maximum_output_id_table;
+        using impl::maximum_strided_dispatch_table;
+
+        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, maximum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                maximum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                maximum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               maximum_output_id_table);
+        };
+        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
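+        // _maximum and _maximum_result_type are registered as a pair, the
+        // same pattern every binary function in this extension follows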
+        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp
new file mode 100644
index 000000000000..1f8fc027ac1d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/maximum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_maximum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp
new file mode 100644
index 000000000000..7055ce5c72f5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "minimum.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/minimum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B27: ===== MINIMUM (x1, x2)
+namespace impl
+{
+namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
+
+static binary_contig_impl_fn_ptr_t
+    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_minimum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = minimum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MinimumTypeMapFactory;
+    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(minimum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MinimumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MinimumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_minimum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
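+    // the extra block scope below keeps the using-declarations that pull the
+    // impl:: dispatch tables into view local to the registration code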
+    {
+        impl::populate_minimum_dispatch_tables();
+        using impl::minimum_contig_dispatch_table;
+        using impl::minimum_output_id_table;
+        using impl::minimum_strided_dispatch_table;
+
+        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, minimum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                minimum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                minimum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               minimum_output_id_table);
+        };
+        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp
new file mode 100644
index 000000000000..be2e18a9b37c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/minimum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_minimum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp
new file mode 100644
index 000000000000..5d25f8cc7b19
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.cpp
@@ -0,0 +1,244 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "multiply.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/multiply.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B19: ===== MULTIPLY (x1, x2)
+namespace impl
+{
+
+namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
+
+static binary_contig_impl_fn_ptr_t
+    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
+static int multiply_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// mul(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    multiply_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// mul(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    multiply_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_multiply_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = multiply_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MultiplyTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(multiply_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MultiplyStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MultiplyContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
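+    // the in-place variant (x1 *= x2) dispatches through separate tables,
+    // populated below from the Inplace* factories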
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::MultiplyInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         MultiplyInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::MultiplyInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         MultiplyInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::MultiplyInplaceTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(multiply_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_multiply(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_multiply_dispatch_tables();
+        using impl::multiply_contig_dispatch_table;
+        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::multiply_output_id_table;
+        using impl::multiply_strided_dispatch_table;
+
+        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                multiply_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                multiply_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig row
+                // and c-contig matrix with broadcasting (may be nullptr)
+                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               multiply_output_id_table);
+        };
+        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
+
+        using impl::multiply_inplace_contig_dispatch_table;
+        using impl::multiply_inplace_output_id_table;
+        using impl::multiply_inplace_row_matrix_dispatch_table;
+        using impl::multiply_inplace_strided_dispatch_table;
+
+        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, multiply_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                multiply_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                multiply_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                multiply_inplace_row_matrix_dispatch_table);
+        };
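+        // the in-place entry point takes two arrays (lhs, rhs) and no
+        // separate destination argument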
+        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp
new file mode 100644
index 000000000000..a4ed946a8501
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/multiply.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_multiply(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp
new file mode 100644
index 000000000000..8510a15eab00
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/negative.cpp
@@ -0,0 +1,128 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "negative.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/negative.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U25: ==== NEGATIVE (x)
+namespace impl
+{
+
+namespace negative_fn_ns = dpctl::tensor::kernels::negative;
+
+static unary_contig_impl_fn_ptr_t
+    negative_contig_dispatch_vector[td_ns::num_types];
+static int negative_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    negative_strided_dispatch_vector[td_ns::num_types];
+
+void populate_negative_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = negative_fn_ns;
+
+    using fn_ns::NegativeContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
+
+    using fn_ns::NegativeStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
+
+    using fn_ns::NegativeTypeMapFactory;
+    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_negative(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_negative_dispatch_vectors();
+        using impl::negative_contig_dispatch_vector;
+        using impl::negative_output_typeid_vector;
+        using impl::negative_strided_dispatch_vector;
+
+        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  negative_output_typeid_vector,
+                                  negative_contig_dispatch_vector,
+                                  negative_strided_dispatch_vector);
+        };
+        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
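+        // unary result-type query: maps a single input dtype to the output
+        // dtype recorded in negative_output_typeid_vector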
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_negative(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp new file mode 100644 index 000000000000..42e1ac9bd4c3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.cpp @@ -0,0 +1,146 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "nextafter.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/nextafter.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B28: ===== NEXTAFTER (x1, x2) +namespace impl +{ +namespace nextafter_fn_ns = dpctl::tensor::kernels::nextafter; + +static binary_contig_impl_fn_ptr_t + nextafter_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int nextafter_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + nextafter_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_nextafter_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = nextafter_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::NextafterTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(nextafter_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::NextafterStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(nextafter_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::NextafterContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(nextafter_contig_dispatch_table); +}; + +} // namespace impl + +void init_nextafter(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_nextafter_dispatch_tables(); + using impl::nextafter_contig_dispatch_table; + using impl::nextafter_output_id_table; + using impl::nextafter_strided_dispatch_table; + + auto nextafter_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, nextafter_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + 
nextafter_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + nextafter_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto nextafter_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + nextafter_output_id_table); + }; + m.def("_nextafter", nextafter_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_nextafter_result_type", nextafter_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp new file mode 100644 index 000000000000..76ad701d4012 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/nextafter.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
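+//
+// Every header in this directory exposes a single registration hook of this
+// shape; a central initializer is expected to call them in sequence. A
+// hedged sketch (the aggregating translation unit is not part of this hunk):
+//
+//     void init_elementwise_functions(py::module_ m)
+//     {
+//         init_nextafter(m);
+//         init_not_equal(m);
+//         // ... one call per elementwise operation
+//     }
+//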
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_nextafter(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp
new file mode 100644
index 000000000000..dcbbf0cf015e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.cpp
@@ -0,0 +1,146 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
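+//
+// NOT_EQUAL is a comparison ufunc: its type-map factory resolves every
+// supported input-type pair to a boolean output, so, illustratively,
+// `_not_equal_result_type(int32, float32)` is expected to report the boolean
+// dtype rather than a promoted arithmetic type (hedged; exact dtype
+// spellings depend on the Python-side wrapper).
+//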
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "not_equal.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/not_equal.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B20: ===== NOT_EQUAL (x1, x2) +namespace impl +{ +namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal; + +static binary_contig_impl_fn_ptr_t + not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_not_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = not_equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::NotEqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(not_equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::NotEqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(not_equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::NotEqualContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(not_equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_not_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_not_equal_dispatch_tables(); + using impl::not_equal_contig_dispatch_table; + using impl::not_equal_output_id_table; + using impl::not_equal_strided_dispatch_table; + + auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, not_equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + not_equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + not_equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + not_equal_output_id_table); + }; + m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") 
= py::list()); + m.def("_not_equal_result_type", not_equal_result_type_pyapi, ""); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp new file mode 100644 index 000000000000..c6c99bb793bc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/not_equal.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_not_equal(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp new file mode 100644 index 000000000000..6518b10a77c0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/positive.cpp @@ -0,0 +1,128 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "positive.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/positive.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U26: ==== POSITIVE (x) +namespace impl +{ + +namespace positive_fn_ns = dpctl::tensor::kernels::positive; + +static unary_contig_impl_fn_ptr_t + positive_contig_dispatch_vector[td_ns::num_types]; +static int positive_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + positive_strided_dispatch_vector[td_ns::num_types]; + +void populate_positive_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = positive_fn_ns; + + using fn_ns::PositiveContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(positive_contig_dispatch_vector); + + using fn_ns::PositiveStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(positive_strided_dispatch_vector); + + using fn_ns::PositiveTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(positive_output_typeid_vector); +}; + +} // namespace impl + +void init_positive(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_positive_dispatch_vectors(); + using impl::positive_contig_dispatch_vector; + using impl::positive_output_typeid_vector; + using impl::positive_strided_dispatch_vector; + + auto positive_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue 
&exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + positive_output_typeid_vector, + positive_contig_dispatch_vector, + positive_strided_dispatch_vector); + }; + m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto positive_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + positive_output_typeid_vector); + }; + m.def("_positive_result_type", positive_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp new file mode 100644 index 000000000000..05bd04b577af --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/positive.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
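+//
+// Note that the header deliberately exposes only the registration hook; the
+// dispatch vectors themselves are static (translation-unit-local) state in
+// the matching .cpp, so no kernel tables leak across the extension's source
+// files.
+//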
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_positive(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp
new file mode 100644
index 000000000000..990515fa5402
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/pow.cpp
@@ -0,0 +1,203 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
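+//
+// POW registers both an out-of-place binding (_pow) and an in-place one
+// (_pow_inplace). A hedged sketch of the in-place contract (`ti`, `x`, `y`,
+// and `q` are illustrative assumptions):
+//
+//     ht_ev, comp_ev = ti._pow_inplace(lhs=x, rhs=y, sycl_queue=q,
+//                                      depends=[])
+//     ht_ev.wait()  # x now holds x ** y, with no temporary allocated
+//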
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "pow.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/pow.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B21: ===== POW (x1, x2) +namespace impl +{ + +namespace pow_fn_ns = dpctl::tensor::kernels::pow; + +static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static int pow_output_id_table[td_ns::num_types][td_ns::num_types]; +static int pow_inplace_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_pow_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = pow_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::PowTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(pow_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::PowStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(pow_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::PowContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(pow_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::PowInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::PowInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table); + + // which types are supported by the in-place kernels + using fn_ns::PowInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(pow_inplace_output_id_table); +}; + +} // namespace impl + +void init_pow(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_pow_dispatch_tables(); + using impl::pow_contig_dispatch_table; + using impl::pow_output_id_table; + using impl::pow_strided_dispatch_table; + + auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( 
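+                // argument order: data first (src1, src2, dst), then the
+                // execution queue and dependencies, the result-type map, and
+                // kernel tables from most specialized (contiguous) to most
+                // general (strided); unused broadcast specializations are
+                // passed as NullPtrTable sentinels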
+ src1, src2, dst, exec_q, depends, pow_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + pow_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + pow_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto pow_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + pow_output_id_table); + }; + m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_pow_result_type", pow_result_type_pyapi, ""); + + using impl::pow_inplace_contig_dispatch_table; + using impl::pow_inplace_output_id_table; + using impl::pow_inplace_strided_dispatch_table; + + auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, pow_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + pow_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + pow_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp new file mode 100644 index 000000000000..197a23b80d8a --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/pow.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_pow(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp new file mode 100644 index 000000000000..9583de8bd195 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/proj.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
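+//
+// PROJ computes the projection onto the Riemann sphere, following std::proj
+// semantics: finite complex values map to themselves, while any value with
+// an infinite component maps to (+inf, copysign(0, imag)). Illustrative
+// expectation (hedged):
+//
+//     complex64 input:  [1+2j, inf-3j]
+//     _proj output:     [1+2j, inf-0j]
+//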
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "proj.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/proj.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U40: ==== PROJ (x) +namespace impl +{ + +namespace proj_fn_ns = dpctl::tensor::kernels::proj; + +static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types]; +static int proj_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + proj_strided_dispatch_vector[td_ns::num_types]; + +void populate_proj_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = proj_fn_ns; + + using fn_ns::ProjContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(proj_contig_dispatch_vector); + + using fn_ns::ProjStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(proj_strided_dispatch_vector); + + using fn_ns::ProjTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(proj_output_typeid_vector); +}; + +} // namespace impl + +void init_proj(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_proj_dispatch_vectors(); + using impl::proj_contig_dispatch_vector; + using impl::proj_output_typeid_vector; + using impl::proj_strided_dispatch_vector; + + auto proj_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, proj_output_typeid_vector, + proj_contig_dispatch_vector, proj_strided_dispatch_vector); + }; + m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto proj_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector); + }; + m.def("_proj_result_type", proj_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp new file mode 100644 index 000000000000..3cdc0e8271b0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/proj.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_proj(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/real.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/real.cpp new file mode 100644 index 000000000000..6ed3f5fdc404 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/real.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "real.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/real.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U27: ==== REAL (x) +namespace impl +{ + +namespace real_fn_ns = dpctl::tensor::kernels::real; + +static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types]; +static int real_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + real_strided_dispatch_vector[td_ns::num_types]; + +void populate_real_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = real_fn_ns; + + using fn_ns::RealContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(real_contig_dispatch_vector); + + using fn_ns::RealStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(real_strided_dispatch_vector); + + using fn_ns::RealTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(real_output_typeid_vector); +}; + +} // namespace impl + +void init_real(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_real_dispatch_vectors(); + using impl::real_contig_dispatch_vector; + using impl::real_output_typeid_vector; + using impl::real_strided_dispatch_vector; + + auto real_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, real_output_typeid_vector, + real_contig_dispatch_vector, real_strided_dispatch_vector); + }; + m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto real_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, real_output_typeid_vector); + }; + m.def("_real_result_type", real_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/real.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/real.hpp new file mode 100644 index 000000000000..81f4743e823b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/real.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_real(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp new file mode 100644 index 000000000000..cdb0f43dfbe0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.cpp @@ -0,0 +1,129 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "reciprocal.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/reciprocal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U42: ==== RECIPROCAL (x)
+namespace impl
+{
+
+namespace reciprocal_fn_ns = dpctl::tensor::kernels::reciprocal;
+
+static unary_contig_impl_fn_ptr_t
+    reciprocal_contig_dispatch_vector[td_ns::num_types];
+static int reciprocal_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    reciprocal_strided_dispatch_vector[td_ns::num_types];
+
+void populate_reciprocal_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = reciprocal_fn_ns;
+
+    using fn_ns::ReciprocalContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ReciprocalContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(reciprocal_contig_dispatch_vector);
+
+    using fn_ns::ReciprocalStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          ReciprocalStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(reciprocal_strided_dispatch_vector);
+
+    using fn_ns::ReciprocalTypeMapFactory;
+    DispatchVectorBuilder<int, ReciprocalTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(reciprocal_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_reciprocal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_reciprocal_dispatch_vectors();
+        using impl::reciprocal_contig_dispatch_vector;
+        using impl::reciprocal_output_typeid_vector;
+        using impl::reciprocal_strided_dispatch_vector;
+
+        auto reciprocal_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                    sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  reciprocal_output_typeid_vector,
+                                  reciprocal_contig_dispatch_vector,
+                                  reciprocal_strided_dispatch_vector);
+        };
+        m.def("_reciprocal", reciprocal_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto reciprocal_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              reciprocal_output_typeid_vector);
+        };
+        m.def("_reciprocal_result_type", reciprocal_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git
a/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp new file mode 100644 index 000000000000..1d2156f3464e --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/reciprocal.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reciprocal(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp new file mode 100644 index 000000000000..8bdcdbe1b3dd --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp @@ -0,0 +1,205 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp
new file mode 100644
index 000000000000..8bdcdbe1b3dd
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.cpp
@@ -0,0 +1,205 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "remainder.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/remainder.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B22: ===== REMAINDER (x1, x2)
+namespace impl
+{
+
+namespace remainder_fn_ns = dpctl::tensor::kernels::remainder;
+
+static binary_contig_impl_fn_ptr_t
+    remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int remainder_output_id_table[td_ns::num_types][td_ns::num_types];
+static int remainder_inplace_output_id_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    remainder_inplace_strided_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+void populate_remainder_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = remainder_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::RemainderTypeMapFactory;
+    DispatchTableBuilder<int, RemainderTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(remainder_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::RemainderStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, RemainderStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(remainder_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::RemainderContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, RemainderContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(remainder_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::RemainderInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         RemainderInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::RemainderInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         RemainderInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::RemainderInplaceTypeMapFactory;
+    DispatchTableBuilder<int, RemainderInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(remainder_inplace_output_id_table);
+}
+
+} // namespace impl
+
+void init_remainder(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_remainder_dispatch_tables();
+        using impl::remainder_contig_dispatch_table;
+        using impl::remainder_output_id_table;
+        using impl::remainder_strided_dispatch_table;
+
+        auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, remainder_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                remainder_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                remainder_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               remainder_output_id_table);
+        };
+        m.def("_remainder", remainder_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
+
+        using impl::remainder_inplace_contig_dispatch_table;
+        using impl::remainder_inplace_output_id_table;
+        using impl::remainder_inplace_strided_dispatch_table;
+
+        auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                           sycl::queue &exec_q,
+                                           const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, remainder_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                remainder_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                remainder_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_remainder_inplace", remainder_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp
new file mode 100644
index 000000000000..c00bdc9e0e6c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/remainder.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_remainder(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/round.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/round.cpp
new file mode 100644
index 000000000000..d651b567c3c1
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/round.cpp
@@ -0,0 +1,126 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "round.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/round.hpp" + +namespace dpctl::tensor::py_internal +{ +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U28: ==== ROUND (x) +namespace impl +{ + +namespace round_fn_ns = dpctl::tensor::kernels::round; + +static unary_contig_impl_fn_ptr_t + round_contig_dispatch_vector[td_ns::num_types]; +static int round_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + round_strided_dispatch_vector[td_ns::num_types]; + +void populate_round_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = round_fn_ns; + + using fn_ns::RoundContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(round_contig_dispatch_vector); + + using fn_ns::RoundStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(round_strided_dispatch_vector); + + using fn_ns::RoundTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(round_output_typeid_vector); +}; + +} // namespace impl + +void init_round(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_round_dispatch_vectors(); + using impl::round_contig_dispatch_vector; + using impl::round_output_typeid_vector; + using impl::round_strided_dispatch_vector; + + auto round_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + 
src, dst, exec_q, depends, round_output_typeid_vector, + round_contig_dispatch_vector, round_strided_dispatch_vector); + }; + m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto round_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + round_output_typeid_vector); + }; + m.def("_round_result_type", round_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/round.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/round.hpp new file mode 100644 index 000000000000..ca56e110eec5 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/round.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_round(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp new file mode 100644 index 000000000000..738bef333d75 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.cpp @@ -0,0 +1,127 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "rsqrt.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/rsqrt.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U39: ==== RSQRT (x) +namespace impl +{ + +namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt; + +static unary_contig_impl_fn_ptr_t + rsqrt_contig_dispatch_vector[td_ns::num_types]; +static int rsqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + rsqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_rsqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = rsqrt_fn_ns; + + using fn_ns::RsqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector); + + using fn_ns::RsqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector); + + using fn_ns::RsqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_rsqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_rsqrt_dispatch_vectors(); + using impl::rsqrt_contig_dispatch_vector; + using impl::rsqrt_output_typeid_vector; + using impl::rsqrt_strided_dispatch_vector; + + auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, rsqrt_output_typeid_vector, + rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector); + }; + m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + rsqrt_output_typeid_vector); + }; + m.def("_rsqrt_result_type", rsqrt_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp new file mode 100644 index 000000000000..4ba740a31777 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/rsqrt.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_rsqrt(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp new file mode 100644 index 000000000000..5051926e7470 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sign.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sign.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sign.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U29: ==== SIGN (x) +namespace impl +{ + +namespace sign_fn_ns = dpctl::tensor::kernels::sign; + +static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types]; +static int sign_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sign_strided_dispatch_vector[td_ns::num_types]; + +void populate_sign_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sign_fn_ns; + + using fn_ns::SignContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sign_contig_dispatch_vector); + + using fn_ns::SignStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sign_strided_dispatch_vector); + + using fn_ns::SignTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sign_output_typeid_vector); +}; + +} // namespace impl + +void init_sign(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sign_dispatch_vectors(); + using impl::sign_contig_dispatch_vector; + using impl::sign_output_typeid_vector; + using impl::sign_strided_dispatch_vector; + + auto sign_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sign_output_typeid_vector, + sign_contig_dispatch_vector, sign_strided_dispatch_vector); + }; + m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sign_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector); + }; + m.def("_sign_result_type", sign_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp new file mode 100644 index 000000000000..19686ada3dbf --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sign.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sign(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp new file mode 100644 index 000000000000..eeef1de50331 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.cpp @@ -0,0 +1,128 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "signbit.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/signbit.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U41: ==== SIGNBIT (x) +namespace impl +{ + +namespace signbit_fn_ns = dpctl::tensor::kernels::signbit; + +static unary_contig_impl_fn_ptr_t + signbit_contig_dispatch_vector[td_ns::num_types]; +static int signbit_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + signbit_strided_dispatch_vector[td_ns::num_types]; + +void populate_signbit_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = signbit_fn_ns; + + using fn_ns::SignbitContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector); + + using fn_ns::SignbitStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector); + + using fn_ns::SignbitTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(signbit_output_typeid_vector); +}; + +} // namespace impl + +void init_signbit(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_signbit_dispatch_vectors(); + using impl::signbit_contig_dispatch_vector; + using impl::signbit_output_typeid_vector; + using impl::signbit_strided_dispatch_vector; + + auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc(src, dst, exec_q, depends, + signbit_output_typeid_vector, + signbit_contig_dispatch_vector, + signbit_strided_dispatch_vector); + }; + m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto signbit_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + signbit_output_typeid_vector); + }; + m.def("_signbit_result_type", signbit_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp 
b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp new file mode 100644 index 000000000000..292386b174fc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/signbit.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_signbit(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp new file mode 100644 index 000000000000..7db753e27c4b --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sin.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sin.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sin.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U30: ==== SIN (x) +namespace impl +{ + +namespace sin_fn_ns = dpctl::tensor::kernels::sin; + +static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types]; +static int sin_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sin_strided_dispatch_vector[td_ns::num_types]; + +void populate_sin_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sin_fn_ns; + + using fn_ns::SinContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sin_contig_dispatch_vector); + + using fn_ns::SinStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sin_strided_dispatch_vector); + + using fn_ns::SinTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sin_output_typeid_vector); +}; + +} // namespace impl + +void init_sin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sin_dispatch_vectors(); + using impl::sin_contig_dispatch_vector; + using impl::sin_output_typeid_vector; + using impl::sin_strided_dispatch_vector; + + auto sin_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sin_output_typeid_vector, + sin_contig_dispatch_vector, sin_strided_dispatch_vector); + }; + m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto 
sin_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector); + }; + m.def("_sin_result_type", sin_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp new file mode 100644 index 000000000000..a4b3da08b7fc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sin.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp new file mode 100644 index 000000000000..e56a28e0c2aa --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sinh.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sinh.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U31: ==== SINH (x) +namespace impl +{ + +namespace sinh_fn_ns = dpctl::tensor::kernels::sinh; + +static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types]; +static int sinh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sinh_strided_dispatch_vector[td_ns::num_types]; + +void populate_sinh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sinh_fn_ns; + + using fn_ns::SinhContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector); + + using fn_ns::SinhStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector); + + using fn_ns::SinhTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sinh_output_typeid_vector); +}; + +} // namespace impl + +void init_sinh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sinh_dispatch_vectors(); + using impl::sinh_contig_dispatch_vector; + using impl::sinh_output_typeid_vector; + using impl::sinh_strided_dispatch_vector; + + auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, 
depends, sinh_output_typeid_vector, + sinh_contig_dispatch_vector, sinh_strided_dispatch_vector); + }; + m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sinh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector); + }; + m.def("_sinh_result_type", sinh_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp new file mode 100644 index 000000000000..4a0d90d24c8c --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sinh.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sinh(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp new file mode 100644 index 000000000000..a4a715147055 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.cpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. 
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "sqrt.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/sqrt.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U33: ==== SQRT (x) +namespace impl +{ + +namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt; + +static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types]; +static int sqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + sqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_sqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = sqrt_fn_ns; + + using fn_ns::SqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector); + + using fn_ns::SqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector); + + using fn_ns::SqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(sqrt_output_typeid_vector); +}; + +} // namespace impl + +void init_sqrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_sqrt_dispatch_vectors(); + using impl::sqrt_contig_dispatch_vector; + using impl::sqrt_output_typeid_vector; + using impl::sqrt_strided_dispatch_vector; + + auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, sqrt_output_typeid_vector, + sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector); + }; + m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector); + }; + m.def("_sqrt_result_type", sqrt_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp new file mode 100644 index 000000000000..e8f7014c1afc --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/sqrt.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_sqrt(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/square.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/square.cpp
new file mode 100644
index 000000000000..d3e229ae42fc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/square.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "square.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/square.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U32: ==== SQUARE (x)
+namespace impl
+{
+
+namespace square_fn_ns = dpctl::tensor::kernels::square;
+
+static unary_contig_impl_fn_ptr_t
+    square_contig_dispatch_vector[td_ns::num_types];
+static int square_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    square_strided_dispatch_vector[td_ns::num_types];
+
+void populate_square_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = square_fn_ns;
+
+    using fn_ns::SquareContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SquareContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(square_contig_dispatch_vector);
+
+    using fn_ns::SquareStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SquareStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(square_strided_dispatch_vector);
+
+    using fn_ns::SquareTypeMapFactory;
+    DispatchVectorBuilder<int, SquareTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(square_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_square(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_square_dispatch_vectors();
+        using impl::square_contig_dispatch_vector;
+        using impl::square_output_typeid_vector;
+        using impl::square_strided_dispatch_vector;
+
+        auto square_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, square_output_typeid_vector,
+                square_contig_dispatch_vector, square_strided_dispatch_vector);
+        };
+        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              square_output_typeid_vector);
+        };
+        m.def("_square_result_type", square_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/square.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/square.hpp
new file mode 100644
index 000000000000..3f23f184499c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/square.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_square(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp
new file mode 100644
index 000000000000..ec6edaa52dd5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.cpp
@@ -0,0 +1,243 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "subtract.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/subtract.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B23: ===== SUBTRACT (x1, x2)
+namespace impl
+{
+namespace subtract_fn_ns = dpctl::tensor::kernels::subtract;
+
+static binary_contig_impl_fn_ptr_t
+    subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int subtract_output_id_table[td_ns::num_types][td_ns::num_types];
+static int subtract_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// sub(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    subtract_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// sub(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    subtract_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    subtract_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_subtract_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = subtract_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::SubtractTypeMapFactory;
+    DispatchTableBuilder<int, SubtractTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(subtract_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::SubtractStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, SubtractStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(subtract_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::SubtractContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, SubtractContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(subtract_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::SubtractContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        SubtractContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        subtract_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::SubtractContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        SubtractContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::SubtractInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         SubtractInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::SubtractInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         SubtractInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::SubtractInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         SubtractInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::SubtractInplaceTypeMapFactory;
+    DispatchTableBuilder<int, SubtractInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(subtract_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_subtract(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_subtract_dispatch_tables();
+        using impl::subtract_contig_dispatch_table;
+        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::subtract_output_id_table;
+        using impl::subtract_strided_dispatch_table;
+
+        auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, subtract_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                subtract_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                subtract_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               subtract_output_id_table);
+        };
+        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
+
+        using impl::subtract_inplace_contig_dispatch_table;
+        using impl::subtract_inplace_output_id_table;
+        using impl::subtract_inplace_row_matrix_dispatch_table;
+        using impl::subtract_inplace_strided_dispatch_table;
+
+        auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, subtract_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                subtract_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                subtract_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                subtract_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp
new file mode 100644
index 000000000000..89cdfd6d0ea0
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/subtract.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_subtract(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp
new file mode 100644
index 000000000000..8abdea0e5283
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tan.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
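+///
+/// Besides the kernel entry point, each unary function also exposes a
+/// *_result_type helper. Roughly (a sketch of the intent, not the exact
+/// helper code): the input dtype is translated to a lookup id, the type-map
+/// vector yields the output type id, and an unsupported input (negative id)
+/// is reported to Python as None:
+///
+///     int src_id = array_types.typenum_to_lookup_id(dtype.num());
+///     int dst_id = tan_output_typeid_vector[src_id];
+///     // dst_id < 0 -> None; otherwise the corresponding output dtype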
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "tan.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tan.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U34: ==== TAN (x)
+namespace impl
+{
+
+namespace tan_fn_ns = dpctl::tensor::kernels::tan;
+
+static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types];
+static int tan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tan_fn_ns;
+
+    using fn_ns::TanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tan_contig_dispatch_vector);
+
+    using fn_ns::TanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tan_strided_dispatch_vector);
+
+    using fn_ns::TanTypeMapFactory;
+    DispatchVectorBuilder<int, TanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tan_dispatch_vectors();
+        using impl::tan_contig_dispatch_vector;
+        using impl::tan_output_typeid_vector;
+        using impl::tan_strided_dispatch_vector;
+
+        auto tan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tan_output_typeid_vector,
+                tan_contig_dispatch_vector, tan_strided_dispatch_vector);
+        };
+        m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector);
+        };
+        m.def("_tan_result_type", tan_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp
new file mode 100644
index 000000000000..b0818a9a85c2
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tan.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_tan(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp
new file mode 100644
index 000000000000..bf8ff205c0af
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.cpp
@@ -0,0 +1,125 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <vector>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "tanh.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tanh.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U35: ==== TANH (x)
+namespace impl
+{
+
+namespace tanh_fn_ns = dpctl::tensor::kernels::tanh;
+
+static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types];
+static int tanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tanh_fn_ns;
+
+    using fn_ns::TanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
+
+    using fn_ns::TanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
+
+    using fn_ns::TanhTypeMapFactory;
+    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tanh_dispatch_vectors();
+        using impl::tanh_contig_dispatch_vector;
+        using impl::tanh_output_typeid_vector;
+        using impl::tanh_strided_dispatch_vector;
+
+        auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tanh_output_typeid_vector,
+                tanh_contig_dispatch_vector, tanh_strided_dispatch_vector);
+        };
+        m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector);
+        };
+        m.def("_tanh_result_type", tanh_result_type_pyapi);
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp
new file mode 100644
index 000000000000..d29c924d5e73
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/tanh.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_tanh(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp
new file mode 100644
index 000000000000..4c1a117fbcae
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -0,0 +1,500 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <new>
+#include <stdexcept>
+#include <string>
+#include <tuple> // for std::ignore
+#include <utility>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "simplify_iteration_space.hpp"
+#include "true_divide.hpp"
+
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/true_divide.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B08: ===== DIVIDE (x1, x2)
+namespace impl
+{
+namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
+
+static binary_contig_impl_fn_ptr_t
+    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// divide(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// divide(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    true_divide_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    true_divide_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+void populate_true_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = true_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::TrueDivideTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(true_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::TrueDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         TrueDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::TrueDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, TrueDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::TrueDivideInplaceTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(true_divide_inplace_output_id_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::TrueDivideInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         TrueDivideInplaceStridedFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::TrueDivideInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         TrueDivideInplaceContigFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         TrueDivideInplaceRowMatrixBroadcastFactory, num_types>
+        dtb9;
+    dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table);
+};
+
+template <typename T, typename scalarT>
+class divide_by_scalar_krn;
+
+typedef sycl::event (*divide_by_scalar_fn_ptr_t)(
+    sycl::queue &,
+    std::size_t,
+    int,
+    const ssize_t *,
+    const char *,
+    py::ssize_t,
+    const char *,
+    char *,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T, typename scalarT>
+sycl::event divide_by_scalar(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             py::ssize_t arg_offset,
+                             const char *scalar_ptr,
+                             char *res_p,
+                             py::ssize_t res_offset,
+                             const std::vector<sycl::event> &depends = {})
+{
+    const scalarT sc_v = *reinterpret_cast<const scalarT *>(scalar_ptr);
+
+    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        using BinOpT =
+            dpctl::tensor::kernels::true_divide::TrueDivideFunctor<T, scalarT,
+                                                                   T>;
+
+        auto op = BinOpT();
+
+        using IndexerT =
+            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+
+        const IndexerT two_offsets_indexer{nd, arg_offset, res_offset,
+                                           shape_and_strides};
+
+        const T *arg_tp = reinterpret_cast<const T *>(arg_p);
+        T *res_tp = reinterpret_cast<T *>(res_p);
+
+        cgh.parallel_for<divide_by_scalar_krn<T, scalarT>>(
+            {nelems}, [=](sycl::id<1> id) {
+                const auto &two_offsets_ =
+                    two_offsets_indexer(static_cast<py::ssize_t>(id.get(0)));
+
+                const auto &arg_i = two_offsets_.get_first_offset();
+                const auto &res_i = two_offsets_.get_second_offset();
+                res_tp[res_i] = op(arg_tp[arg_i], sc_v);
+            });
+    });
+    return comp_ev;
+}
+
+std::pair<sycl::event, sycl::event>
+    py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
+                        double scalar,
+                        const dpctl::tensor::usm_ndarray &dst,
+                        sycl::queue &exec_q,
+                        const std::vector<sycl::event> &depends = {})
+{
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array has unexpected elemental data type.");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+    // check shapes, broadcasting is assumed done by caller
+    // check that dimensions are the same
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != src.get_ndim()) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; i < dst_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(src, dst) && !same_logical_tensors(src, dst))) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    static constexpr int float16_typeid =
+        static_cast<int>(td_ns::typenum_t::HALF);
+    static constexpr int float32_typeid =
+        static_cast<int>(td_ns::typenum_t::FLOAT);
+    static constexpr int float64_typeid =
+        static_cast<int>(td_ns::typenum_t::DOUBLE);
+    static constexpr int complex64_typeid =
+        static_cast<int>(td_ns::typenum_t::CFLOAT);
+    static constexpr int complex128_typeid =
+        static_cast<int>(td_ns::typenum_t::CDOUBLE);
+
+    // statically pre-allocated memory for scalar
+    alignas(double) char scalar_alloc[sizeof(double)] = {0};
+
+    divide_by_scalar_fn_ptr_t fn;
+    // placement new into stack memory means no call to delete is necessary
+    switch (src_typeid) {
+    case float16_typeid:
+    {
+        fn = divide_by_scalar<sycl::half, sycl::half>;
+        std::ignore =
+            new (scalar_alloc) sycl::half(static_cast<sycl::half>(scalar));
+        break;
+    }
+    case float32_typeid:
+    {
+        fn = divide_by_scalar<float, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+        break;
+    }
+    case float64_typeid:
+    {
+        fn = divide_by_scalar<double, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+        break;
+    }
+    case complex64_typeid:
+    {
+        fn = divide_by_scalar<std::complex<float>, float>;
+        std::ignore = new (scalar_alloc) float(scalar);
+        break;
+    }
+    case complex128_typeid:
+    {
+        fn = divide_by_scalar<std::complex<double>, double>;
+        std::ignore = new (scalar_alloc) double(scalar);
+        break;
+    }
+    default:
+        throw std::runtime_error("Implementation is missing for typeid=" +
+                                 std::to_string(src_typeid));
+    }
+
+    // simplify strides
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = dst_nd;
+    const py::ssize_t *shape = src_shape;
+
+    std::vector<sycl::event> host_tasks{};
+    simplify_iteration_space(nd, shape, src_strides, dst_strides,
+                             // outputs
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (nd == 0) {
+        // handle 0d array as 1d array with 1 element
+        static constexpr py::ssize_t one{1};
+        simplified_shape.push_back(one);
+        simplified_src_strides.push_back(one);
+        simplified_dst_strides.push_back(one);
+        src_offset = 0;
+        dst_offset = 0;
+    }
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_tasks, simplified_shape, simplified_src_strides,
+        simplified_dst_strides);
+    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
+    auto &copy_metadata_ev = std::get<2>(ptr_sz_event_triple_);
+
+    const py::ssize_t *shape_strides = shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    sycl::event div_ev =
+        fn(exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
+           scalar_alloc, dst_data, dst_offset, all_deps);
+
+    // async free of shape_strides temporary
+    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {div_ev}, shape_strides_owner);
+
+    host_tasks.push_back(tmp_cleanup_ev);
+
+    return std::make_pair(
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_tasks), div_ev);
+}
+
+} // namespace impl
+
+void init_divide(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_true_divide_dispatch_tables();
+        using impl::true_divide_contig_dispatch_table;
+        using impl::
+            true_divide_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::
+            true_divide_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::true_divide_output_id_table;
+        using impl::true_divide_strided_dispatch_table;
+
+        auto divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, true_divide_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                true_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                true_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto divide_result_type_pyapi = [&](const py::dtype &dtype1,
+                                            const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               true_divide_output_id_table);
+        };
+        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_divide_result_type", divide_result_type_pyapi, "");
+
+        using impl::true_divide_inplace_contig_dispatch_table;
+        using impl::true_divide_inplace_output_id_table;
+        using impl::true_divide_inplace_row_matrix_dispatch_table;
+        using impl::true_divide_inplace_strided_dispatch_table;
+
+        auto divide_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, true_divide_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                true_divide_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                true_divide_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                true_divide_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        using impl::py_divide_by_scalar;
+        m.def("_divide_by_scalar", &py_divide_by_scalar, "", py::arg("src"),
+              py::arg("scalar"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp
new file mode 100644
index 000000000000..941384beaf8d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/true_divide.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_divide(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp
new file mode 100644
index 000000000000..3a798d8e110d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.cpp
@@ -0,0 +1,127 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
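+///
+/// The shared py_unary_ufunc helper used below chooses between the two
+/// dispatch vectors at run time; in outline (a sketch, details such as
+/// offset handling elided):
+///
+///     if (src.is_c_contiguous() && dst.is_c_contiguous()) {
+///         ev = trunc_contig_dispatch_vector[id](q, n, src_p, dst_p, deps);
+///     } else {
+///         ev = trunc_strided_dispatch_vector[id](q, n, nd, shape_strides,
+///                                                src_p, src_off,
+///                                                dst_p, dst_off, deps);
+///     }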
+//===---------------------------------------------------------------------===// + +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "trunc.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/trunc.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U36: ==== TRUNC (x) +namespace impl +{ + +namespace trunc_fn_ns = dpctl::tensor::kernels::trunc; + +static unary_contig_impl_fn_ptr_t + trunc_contig_dispatch_vector[td_ns::num_types]; +static int trunc_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + trunc_strided_dispatch_vector[td_ns::num_types]; + +void populate_trunc_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = trunc_fn_ns; + + using fn_ns::TruncContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector); + + using fn_ns::TruncStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector); + + using fn_ns::TruncTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(trunc_output_typeid_vector); +}; + +} // namespace impl + +void init_trunc(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_trunc_dispatch_vectors(); + using impl::trunc_contig_dispatch_vector; + using impl::trunc_output_typeid_vector; + using impl::trunc_strided_dispatch_vector; + + auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, trunc_output_typeid_vector, + trunc_contig_dispatch_vector, trunc_strided_dispatch_vector); + }; + m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto trunc_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + trunc_output_typeid_vector); + }; + m.def("_trunc_result_type", trunc_result_type_pyapi); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp new file mode 100644 index 000000000000..79ed6b5ded14 --- /dev/null +++ b/dpnp/tensor/libtensor/source/elementwise_functions/trunc.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
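The DispatchVectorBuilder/Factory pattern driving populate_trunc_dispatch_vectors above is easiest to see in isolation: for each type id T, Factory<fnT, T>::get() yields either a concrete kernel pointer or nullptr for an unsupported type, and the builder writes the results into a flat table indexed by type id. A self-contained sketch, with all names illustrative:

#include <cstddef>
#include <type_traits>

using unary_fn_ptr_t = void (*)(const void *, void *, std::size_t);

template <typename T>
void example_unary_impl(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T *d = static_cast<T *>(dst);
    for (std::size_t i = 0; i < n; ++i) {
        d[i] = s[i]; // stand-in for the real elementwise computation
    }
}

template <typename fnT, typename T> struct ExampleFactory
{
    fnT get()
    {
        if constexpr (std::is_arithmetic_v<T>) {
            return example_unary_impl<T>; // supported: concrete kernel
        }
        else {
            return nullptr; // unsupported type id: no kernel registered
        }
    }
};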
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_elementwise_impl +/// extension, specifically functions for elementwise operations. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_trunc(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/eye_ctor.cpp b/dpnp/tensor/libtensor/source/eye_ctor.cpp new file mode 100644 index 000000000000..025a7d58d06e --- /dev/null +++ b/dpnp/tensor/libtensor/source/eye_ctor.cpp @@ -0,0 +1,142 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "eye_ctor.hpp" +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::eye_fn_ptr_t; +static eye_fn_ptr_t eye_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 2D + + if (dst.get_ndim() != 2) { + throw py::value_error( + "usm_ndarray_eye: Expecting 2D array to populate"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + const py::ssize_t nelem = dst.get_size(); + const py::ssize_t rows = dst.get_shape(0); + const py::ssize_t cols = dst.get_shape(1); + if (rows == 0 || cols == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + if (!is_dst_c_contig && !is_dst_f_contig) { + throw py::value_error("USM array is not contiguous"); + } + + py::ssize_t start; + if (is_dst_c_contig) { + start = (k < 0) ? -k * cols : k; + } + else { + start = (k < 0) ? -k : k * rows; + } + + const py::ssize_t *strides = dst.get_strides_raw(); + py::ssize_t step; + if (strides == nullptr) { + step = (is_dst_c_contig) ? cols + 1 : rows + 1; + } + else { + step = strides[0] + strides[1]; + } + + const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); + const py::ssize_t end = start + step * (length - 1); + + char *dst_data = dst.get_data(); + sycl::event eye_event; + + auto fn = eye_dispatch_vector[dst_typeid]; + + eye_event = fn(exec_q, static_cast(nelem), start, end, step, + dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), + eye_event); +} + +void init_eye_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::constructors::EyeFactory; + + DispatchVectorBuilder dvb; + dvb.populate_dispatch_vector(eye_dispatch_vector); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/eye_ctor.hpp b/dpnp/tensor/libtensor/source/eye_ctor.hpp new file mode 100644 index 000000000000..dda7f2c4813a --- /dev/null +++ b/dpnp/tensor/libtensor/source/eye_ctor.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
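The start/step/length arithmetic in usm_ndarray_eye above follows from the flat layout: for a C-contiguous rows x cols matrix, element (i, j) lives at flat index i * cols + j, so diagonal k begins at flat index k (for k >= 0) or -k * cols (for k < 0) and advances by cols + 1 per element. A worked check with illustrative values:

#include <algorithm>
#include <cassert>

int main()
{
    const long rows = 4, cols = 5, k = -1; // k-th diagonal of a 4 x 5 matrix

    // C-contiguous layout: element (i, j) sits at flat index i * cols + j.
    const long start = (k < 0) ? -k * cols : k; // first diagonal element
    const long step = cols + 1;                 // flat distance between elements
    const long length = std::min({rows, cols, rows + k, cols - k});

    assert(start == 5);  // element (1, 0)
    assert(step == 6);
    assert(length == 3); // (1, 0), (2, 1), (3, 2)
    return 0;
}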
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_eye_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/full_ctor.cpp b/dpnp/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 000000000000..8d7fcd22b914 --- /dev/null +++ b/dpnp/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,309 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include // py::cast> +#include + +#include "kernels/constructors.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + sycl::event fill_ev; + + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const { return bool(!v1) && bool(!v2); } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + static constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } + + return fill_ev; +} + +template +struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, + int, + std::size_t, + py::ssize_t *, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given strided memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &exec_q, + int nd, + std::size_t nelems, + py::ssize_t *shape_strides, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + using dpctl::tensor::kernels::constructors::full_strided_impl; + sycl::event fill_ev = full_strided_impl( + exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends); + + return fill_ev; +} + +template +struct FullStridedFactory +{ + fnT get() + { + fnT f = full_strided_impl; + return f; + } +}; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; +static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // py_value should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + int nd = dst.get_ndim(); + auto const &dst_shape = dst.get_shape_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + auto fn = full_strided_dispatch_vector[dst_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &full_strided_ev = + fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, + {copy_shape_ev}); + + // free shape_strides + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), + full_strided_ev); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(full_contig_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(full_strided_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/full_ctor.hpp b/dpnp/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 000000000000..18c15de87a40 --- /dev/null +++ b/dpnp/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,57 @@ 
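full_contig_impl above special-cases fill values whose object representation is all zero bytes, lowering the fill to a plain memset instead of launching a fill kernel; the sizeof-keyed bit_cast chain (including the UInt128 helper for 16-byte types) is just that zero-bits test. A portable standalone equivalent, illustrative only:

#include <cstring>
#include <type_traits>

// Returns true when the value's object representation is all zero bytes,
// in which case the fill can be lowered to memset.
template <typename T> bool is_all_zero_bits(const T &v)
{
    static_assert(std::is_trivially_copyable_v<T>);
    unsigned char bytes[sizeof(T)];
    std::memcpy(bytes, &v, sizeof(T)); // inspect raw bytes portably
    for (unsigned char b : bytes) {
        if (b != 0) {
            return false;
        }
    }
    return true;
}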
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp new file mode 100644 index 000000000000..c6021bdfd2d1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -0,0 +1,814 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
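A recurring lifetime idiom in this file (see _populate_kernel_params below) is to submit an empty host_task that captures shared ownership of host-side staging buffers and depends on the device-copy events, so the buffers cannot be freed before the asynchronous copies out of them complete. A minimal sketch, assuming a SYCL 2020 queue; names are illustrative:

#include <memory>
#include <vector>

#include <sycl/sycl.hpp>

// The host_task body is empty on purpose: what matters is that the
// captured shared_ptr keeps the buffer alive until the copies finish.
sycl::event keep_alive_until(sycl::queue &q,
                             std::shared_ptr<std::vector<long>> host_buf,
                             const std::vector<sycl::event> &copy_evs)
{
    return q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(copy_evs);
        cgh.host_task([buf = std::move(host_buf)] {});
    });
}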
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/integer_advanced_indexing.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "integer_advanced_indexing.hpp" + +#define INDEXING_MODES 2 +#define WRAP_MODE 0 +#define CLIP_MODE 1 + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::vector + _populate_kernel_params(sycl::queue &exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + const py::ssize_t *arr_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) +{ + + using usm_host_allocator_T = + dpctl::tensor::alloc_utils::usm_host_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); + + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, 
sz_allocator); + + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * (k + ind_sh_elems), sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + const sycl::event &device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + const sycl::event &device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); + } + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems + + axis_start); + } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); + + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); + } + + if (ind_nd > 0) { + std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k + ind_nd); + } + + const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + const sycl::event &device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + const sycl::event &shared_ptr_cleanup_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); + }); + host_task_events.push_back(shared_ptr_cleanup_ev); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; +} + +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ +std::vector parse_py_ind(const sycl::queue &q, + const py::object &py_ind) +{ + std::size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool nd_is_known 
= false; + int nd = -1; + for (std::size_t i = 0; i < ind_count; ++i) { + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (nd_is_known) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + nd_is_known = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + +std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &dst, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + + int k = ind.size(); + + if (k == 0) { + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(src_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? 
i : i + ind_nd; + + orthog_nelems *= static_cast(src_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Array memory overlap."); + } + + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * ind_nelems); + + int ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + + std::vector ind_offsets; + ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(dst, ind_)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = 
packed_ind_ptrs_owner.get(); + + // rearrange to past where indices shapes are checked + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(src_nd - k, 1); + + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + + // dst_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k], + // dst_shape[axis:axis+ind.ndim], + // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, take_generic_ev); +} + +std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &val, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); + + if (k == 0) { + // no indices to write to + throw py::value_error("List of indices is 
empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(dst_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + std::size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(val, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are 
checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(ind_, dst)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(dst_nd - k, 1); + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + + // val_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k], + // val_shape[axis:axis+ind.ndim], + // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, val_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event 
put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); + + return std::make_pair(arg_cleanup_ev, put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..bc0136288e1c --- /dev/null +++ b/dpnp/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -0,0 +1,71 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
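The packed_shapes_strides comments above describe one flat device allocation holding the orthogonal (non-indexed) shape followed by the two corresponding stride blocks. A host-side sketch of that packing, simplified to a single indexed axis (k == 1 with a 1-D index array, so source and destination skip the same axis); illustrative only:

#include <cstddef>
#include <vector>

// Pack the non-indexed dimensions of shape plus two stride vectors into
// one flat buffer of 3 * orthog_nd entries, skipping the indexed axis.
std::vector<long> pack_orthog(const std::vector<long> &shape,
                              const std::vector<long> &src_strides,
                              const std::vector<long> &dst_strides,
                              std::size_t axis)
{
    const std::size_t nd = shape.size();
    const std::size_t orthog_nd = nd - 1;
    std::vector<long> packed(3 * orthog_nd);

    std::size_t j = 0;
    for (std::size_t i = 0; i < nd; ++i) {
        if (i == axis) {
            continue; // skip the indexed axis
        }
        packed[j] = shape[i];
        packed[orthog_nd + j] = src_strides[i];
        packed[2 * orthog_nd + j] = dst_strides[i];
        ++j;
    }
    return packed;
}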
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.take and dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern void init_advanced_indexing_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp new file mode 100644 index 000000000000..9621ebc3277f --- /dev/null +++ b/dpnp/tensor/libtensor/source/linalg_functions/dot.cpp @@ -0,0 +1,834 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "dot.hpp" +#include "dot_atomic_support.hpp" +#include "dot_dispatch.hpp" +#include "elementwise_functions/elementwise_functions_type_utils.hpp" +#include "kernels/linalg_functions/dot_product.hpp" +#include "kernels/linalg_functions/gemm.hpp" +#include "reductions/reduction_atomic_support.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +static int dot_output_id_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::dot_product_impl_fn_ptr_t; +static dot_product_impl_fn_ptr_t dot_product_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static dot_product_impl_fn_ptr_t + dot_product_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::dot_product_contig_impl_fn_ptr_t; +static dot_product_contig_impl_fn_ptr_t + dot_product_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static dot_product_contig_impl_fn_ptr_t + dot_product_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_impl_fn_ptr_t; +static gemm_impl_fn_ptr_t gemm_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static gemm_impl_fn_ptr_t gemm_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_contig_impl_fn_ptr_t; +static gemm_contig_impl_fn_ptr_t + gemm_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static gemm_contig_impl_fn_ptr_t + gemm_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_batch_impl_fn_ptr_t; +static gemm_batch_impl_fn_ptr_t + gemm_batch_atomic_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static gemm_batch_impl_fn_ptr_t + gemm_batch_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::gemm_batch_contig_impl_fn_ptr_t; +static gemm_batch_contig_impl_fn_ptr_t + gemm_batch_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static gemm_batch_contig_impl_fn_ptr_t + gemm_batch_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void init_dot_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(dot_output_id_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(gemm_batch_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(gemm_batch_contig_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(gemm_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(gemm_contig_atomic_dispatch_table); + + td_ns::DispatchTableBuilder + dtb6; + 
dtb6.populate_dispatch_table(gemm_batch_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(gemm_batch_contig_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(gemm_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb9; + dtb9.populate_dispatch_table(gemm_contig_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb10; + dtb10.populate_dispatch_table(dot_product_dispatch_table); + + td_ns::DispatchTableBuilder + dtb11; + dtb11.populate_dispatch_table(dot_product_temps_dispatch_table); + + td_ns::DispatchTableBuilder + dtb12; + dtb12.populate_dispatch_table(dot_product_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb13; + dtb13.populate_dispatch_table(dot_product_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t dot_atomic_support_vector[td_ns::num_types]; + +void init_dot_atomic_support_vector(void) +{ + + using atomic_support::DotAtomicSupportFactory; + td_ns::DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(dot_atomic_support_vector); +} + +std::pair + py_dot(const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2, + int batch_dims, + int x1_outer_dims, + int x2_outer_dims, + int inner_dims, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (inner_dims == 0) { + throw py::value_error("No inner dimension for dot"); + } + + int x1_nd = x1.get_ndim(); + int x2_nd = x2.get_ndim(); + if (x1_nd != (batch_dims + x1_outer_dims + inner_dims) || + x2_nd != (batch_dims + x2_outer_dims + inner_dims)) { + throw py::value_error("Input arrays do not have dimensions consistent " + "with input dimensions"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != (batch_dims + x1_outer_dims + x2_outer_dims)) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of input dimensions"); + } + + const py::ssize_t *x1_shape_ptr = x1.get_shape_raw(); + const py::ssize_t *x2_shape_ptr = x2.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t batches(1); + for (int i = 0; same_shapes && (i < batch_dims); ++i) { + same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]) && + (x2_shape_ptr[i] == dst_shape_ptr[i]); + batches *= x1_shape_ptr[i]; + } + std::size_t x1_outer_nelems(1); + for (int i = batch_dims; same_shapes && (i < (batch_dims + x1_outer_dims)); + ++i) { + same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]); + x1_outer_nelems *= x1_shape_ptr[i]; + } + std::size_t inner_nelems(1); + for (int i = batch_dims; i < (batch_dims + inner_dims); ++i) { + auto x1_shape_idx = x1_outer_dims + i; + same_shapes = + same_shapes && (x1_shape_ptr[x1_shape_idx] == x2_shape_ptr[i]); + inner_nelems *= x1_shape_ptr[x1_shape_idx]; + } + std::size_t x2_outer_nelems(1); + for (int i = 0; same_shapes && (i < x2_outer_dims); ++i) { + auto x2_shape_idx = batch_dims + inner_dims + i; + same_shapes = + same_shapes && (x2_shape_ptr[x2_shape_idx] == + dst_shape_ptr[batch_dims + x1_outer_dims + i]); + x2_outer_nelems *= x2_shape_ptr[x2_shape_idx]; + } + if (!same_shapes) { + throw py::value_error("Input arrays to 
tensor dot product do not have " + "appropriate shapes"); + } + + std::size_t dst_nelems = batches * x1_outer_nelems * x2_outer_nelems; + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + if (static_cast(dst.get_size()) != dst_nelems) { + throw py::value_error("dst shape and size mismatch"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with x1 or x2 + if (overlap(dst, x1) || overlap(dst, x2)) { + throw py::value_error("Result array overlaps with inputs"); + } + + int x1_typenum = x1.get_typenum(); + int x2_typenum = x2.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum); + int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int output_typeid = dot_output_id_table[x1_typeid][x2_typeid]; + + if (output_typeid != dst_typeid) { + throw py::value_error( + "Result array has unexpected elemental data type."); + } + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + bool supports_atomics = + dot_atomic_support_vector[output_typeid](exec_q, usm_type); + + const char *x1_data = x1.get_data(); + const char *x2_data = x2.get_data(); + char *dst_data = dst.get_data(); + + const auto &x1_shape_vec = x1.get_shape_vector(); + const auto &x1_strides_vec = x1.get_strides_vector(); + + const auto &x2_shape_vec = x2.get_shape_vector(); + const auto &x2_strides_vec = x2.get_strides_vector(); + + const auto &dst_shape_vec = dst.get_shape_vector(); + const auto &dst_strides_vec = dst.get_strides_vector(); + + bool is_x1_c_contig = x1.is_c_contiguous(); + bool is_x1_f_contig = x1.is_f_contiguous(); + bool is_x2_c_contig = x2.is_c_contiguous(); + bool is_x2_f_contig = x2.is_f_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + bool call_vecdot = ((x1_outer_dims == 0 && x1_outer_nelems == 1) && + (x2_outer_dims == 0 && x2_outer_nelems == 1)); + + bool call_batched = (batch_dims != 0 || batches > 1); + std::vector host_task_events{}; + sycl::event dot_ev; + if (call_vecdot) { + if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig) || + ((is_x1_f_contig && is_x2_f_contig) && !call_batched)) { + dot_product_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = dot_product_contig_dispatch_table[x1_typeid][x2_typeid]; + } + else { + fn = dot_product_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), + x2.get_data(), dst.get_data(), + zero_offset, // lhs batch offset + zero_offset, // rhs batch offset + zero_offset, // res batch offset + zero_offset, // lhs reduction offset + zero_offset, // rhs reduction offset + depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + int inner_nd = inner_dims; + const py::ssize_t *inner_shape_ptr = x1_shape_ptr + batch_dims; + using shT = std::vector; + const shT inner_x1_strides(std::begin(x1_strides_vec) + batch_dims, + std::end(x1_strides_vec)); + const shT inner_x2_strides(std::begin(x2_strides_vec) + batch_dims, + std::end(x2_strides_vec)); + + shT simplified_inner_shape; + shT 
simplified_inner_x1_strides; + shT simplified_inner_x2_strides; + py::ssize_t inner_x1_offset(0); + py::ssize_t inner_x2_offset(0); + + simplify_iteration_space( + inner_nd, inner_shape_ptr, inner_x1_strides, inner_x2_strides, + // output + simplified_inner_shape, simplified_inner_x1_strides, + simplified_inner_x2_strides, inner_x1_offset, inner_x2_offset); + + const py::ssize_t *batch_shape_ptr = x1_shape_ptr; + + const shT batch_x1_strides(std::begin(x1_strides_vec), + std::begin(x1_strides_vec) + batch_dims); + const shT batch_x2_strides(std::begin(x2_strides_vec), + std::begin(x2_strides_vec) + batch_dims); + shT const &batch_dst_strides = dst_strides_vec; + + shT simplified_batch_shape; + shT simplified_batch_x1_strides; + shT simplified_batch_x2_strides; + shT simplified_batch_dst_strides; + py::ssize_t batch_x1_offset(0); + py::ssize_t batch_x2_offset(0); + py::ssize_t batch_dst_offset(0); + + if (batch_dims == 0) { + if (dst_nelems != 1) { + throw std::runtime_error( + "batch_dims == 0, but dst_nelems != 1"); + } + batch_dims = 1; + simplified_batch_shape.push_back(1); + simplified_batch_x1_strides.push_back(0); + simplified_batch_x2_strides.push_back(0); + simplified_batch_dst_strides.push_back(0); + } + else { + simplify_iteration_space_3( + batch_dims, batch_shape_ptr, batch_x1_strides, batch_x2_strides, + batch_dst_strides, + // output + simplified_batch_shape, simplified_batch_x1_strides, + simplified_batch_x2_strides, simplified_batch_dst_strides, + batch_x1_offset, batch_x2_offset, batch_dst_offset); + } + + if (inner_nd == 1 && batch_dims == 1) { + bool dot_product_c_contig = false; + bool reduce_all_elems = false; + + if (simplified_inner_x1_strides[0] == 1 && + simplified_inner_x2_strides[0] == 1) { + reduce_all_elems = (simplified_batch_shape[0] == 1); + dot_product_c_contig = + (simplified_batch_dst_strides[0] == 1) && + (static_cast(simplified_batch_x1_strides[0]) == + inner_nelems) && + (static_cast(simplified_batch_x2_strides[0]) == + inner_nelems); + } + + if (dot_product_c_contig || reduce_all_elems) { + dot_product_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = + dot_product_contig_dispatch_table[x1_typeid][x2_typeid]; + } + else { + fn = dot_product_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), + x2.get_data(), dst.get_data(), + batch_x1_offset, // lhs batch offset + batch_x2_offset, // rhs batch offset + batch_dst_offset, // res batch offset + inner_x1_offset, // lhs reduction offset + inner_x2_offset, // rhs reduction offset + depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + } + + dot_product_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = dot_product_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn == nullptr) { + fn = dot_product_temps_dispatch_table[x1_typeid][x2_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "Implementation is missing for x1_typeid=" + + std::to_string(x1_typeid) + + " and x2_typeid=" + std::to_string(x2_typeid)); + } + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_batch_shape, simplified_batch_x1_strides, + simplified_batch_x2_strides, simplified_batch_dst_strides, + // reduction metadata + simplified_inner_shape, simplified_inner_x1_strides, + 
simplified_inner_x2_strides); + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = + std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *inner_shape_stride = + temp_allocation_ptr + 4 * simplified_batch_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + dot_ev = + fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), x2.get_data(), + dst.get_data(), batch_dims, iter_shape_and_strides, + batch_x1_offset, batch_x2_offset, batch_dst_offset, + inner_nd, // number dimensions being reduced + inner_shape_stride, inner_x1_offset, inner_x2_offset, all_deps); + + sycl::event temp_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {dot_ev}, + tmp_alloc_owner); + host_task_events.push_back(temp_cleanup_ev); + } + else { // if (!call_vecdot) + if (!call_batched) { + if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) { + gemm_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = + gemm_contig_atomic_dispatch_table[x1_typeid][x2_typeid]; + } + else { + fn = gemm_contig_temps_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn != nullptr) { + dot_ev = fn(exec_q, x1_data, x2_data, dst_data, + x1_outer_nelems, // n + inner_nelems, // k + x2_outer_nelems, // m + depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + gemm_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_atomic_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn == nullptr) { + fn = gemm_temps_dispatch_table[x1_typeid][x2_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "Implementation is missing for x1_typeid=" + + std::to_string(x1_typeid) + + " and x2_typeid=" + std::to_string(x2_typeid)); + } + } + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, x1_shape_vec, x1_strides_vec, + x2_shape_vec, x2_strides_vec, dst_shape_vec, dst_strides_vec); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *x1_shape_strides = packed_shapes_strides; + const py::ssize_t *x2_shape_strides = + packed_shapes_strides + 2 * (x1_nd); + const py::ssize_t *dst_shape_strides = + packed_shapes_strides + 2 * (x1_nd + x2_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + // change gemm calls to pass inner dims and outer dims separately + dot_ev = + fn(exec_q, x1_data, x2_data, dst_data, x1_outer_nelems, + inner_nelems, x2_outer_nelems, inner_dims, x1_outer_dims, + x1_shape_strides, x2_outer_dims, x2_shape_strides, + x1_outer_dims + x2_outer_dims, dst_shape_strides, all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { // if (call_batched) + using shT = 
std::vector; + // temporary asserts for matmul + assert(x1_outer_dims == 1); + assert(x2_outer_dims == 1); + assert(inner_dims == 1); + + if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) { + gemm_batch_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid] + [x2_typeid]; + } + else { + fn = gemm_batch_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches, + x1_outer_nelems, // n + inner_nelems, // k + x2_outer_nelems, // m + zero_offset, zero_offset, zero_offset, depends); + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {x1, x2, dst}, {dot_ev}), + dot_ev); + } + } + + auto x1_outer_inner_dims = x1_nd - batch_dims; + auto x2_outer_inner_dims = x2_nd - batch_dims; + auto dst_outer_inner_dims = dst_nd - batch_dims; + + shT batch_x1_shape; + shT outer_inner_x1_shape; + shT batch_x1_strides; + shT outer_inner_x1_strides; + split_iteration_space(x1_shape_vec, x1_strides_vec, batch_dims, + batch_dims + x1_outer_inner_dims, + // 4 vectors modified + batch_x1_shape, outer_inner_x1_shape, + batch_x1_strides, outer_inner_x1_strides); + + shT batch_x2_shape; + shT outer_inner_x2_shape; + shT batch_x2_strides; + shT outer_inner_x2_strides; + split_iteration_space(x2_shape_vec, x2_strides_vec, batch_dims, + batch_dims + x2_outer_inner_dims, + // 4 vectors modified + batch_x2_shape, outer_inner_x2_shape, + batch_x2_strides, outer_inner_x2_strides); + + shT batch_dst_shape; + shT outer_inner_dst_shape; + shT batch_dst_strides; + shT outer_inner_dst_strides; + split_iteration_space(dst_shape_vec, dst_strides_vec, batch_dims, + batch_dims + dst_outer_inner_dims, + // 4 vectors modified + batch_dst_shape, outer_inner_dst_shape, + batch_dst_strides, outer_inner_dst_strides); + + using shT = std::vector; + shT simplified_batch_shape; + shT simplified_batch_x1_strides; + shT simplified_batch_x2_strides; + shT simplified_batch_dst_strides; + py::ssize_t x1_batch_offset(0); + py::ssize_t x2_batch_offset(0); + py::ssize_t dst_batch_offset(0); + + const py::ssize_t *shape = x1_shape_ptr; + + simplify_iteration_space_3( + batch_dims, shape, batch_x1_strides, batch_x2_strides, + batch_dst_strides, + // outputs + simplified_batch_shape, simplified_batch_x1_strides, + simplified_batch_x2_strides, simplified_batch_dst_strides, + x1_batch_offset, x2_batch_offset, dst_batch_offset); + + if (batch_dims == 1 && x1_outer_dims == 1 && x2_outer_dims == 1 && + inner_dims == 1) { + bool gemm_batch_c_contig = false; + + if ((static_cast(outer_inner_x1_strides[0]) == + inner_nelems && + outer_inner_x1_strides[1] == 1) && + (static_cast(outer_inner_x2_strides[0]) == + inner_nelems && + outer_inner_x2_strides[1] == 1) && + (static_cast(outer_inner_dst_strides[0]) == + x2_outer_nelems && + outer_inner_dst_strides[1] == 1)) { + gemm_batch_c_contig = + (static_cast( + simplified_batch_x1_strides[0]) == + x1_outer_nelems * inner_nelems) && + (static_cast( + simplified_batch_x2_strides[0]) == + x2_outer_nelems * inner_nelems) && + (static_cast( + simplified_batch_dst_strides[0]) == + x1_outer_nelems * x2_outer_nelems); + } + + if (gemm_batch_c_contig) { + gemm_batch_contig_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid] + [x2_typeid]; + } + else { + fn = gemm_batch_contig_temps_dispatch_table[x1_typeid] + [x2_typeid]; + } + if (fn != nullptr) { + dot_ev = 
fn(exec_q, x1_data, x2_data, dst_data, batches, + x1_outer_nelems, // n + inner_nelems, // k + x2_outer_nelems, // m + x1_batch_offset, x2_batch_offset, + dst_batch_offset, depends); + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst}, + {dot_ev}), + dot_ev); + } + } + } + + gemm_batch_impl_fn_ptr_t fn = nullptr; + if (supports_atomics) { + fn = gemm_batch_atomic_dispatch_table[x1_typeid][x2_typeid]; + } + if (fn == nullptr) { + fn = gemm_batch_temps_dispatch_table[x1_typeid][x2_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "Implementation is missing for x1_typeid=" + + std::to_string(x1_typeid) + + " and x2_typeid=" + std::to_string(x2_typeid)); + } + } + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_batch_shape, + simplified_batch_x1_strides, simplified_batch_x2_strides, + simplified_batch_dst_strides, outer_inner_x1_shape, + outer_inner_x1_strides, outer_inner_x2_shape, + outer_inner_x2_strides, outer_inner_dst_shape, + outer_inner_dst_strides, + // full shape and strides of the result array + // necessary for reduction and initialization + simplified_batch_shape, outer_inner_dst_shape, + simplified_batch_dst_strides, outer_inner_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const auto batch_shape_strides = packed_shapes_strides; + const auto x1_outer_inner_shapes_strides = + packed_shapes_strides + 4 * batch_dims; + const auto x2_outer_inner_shapes_strides = + packed_shapes_strides + 4 * batch_dims + + 2 * (x1_outer_inner_dims); + const auto dst_outer_shapes_strides = + packed_shapes_strides + 4 * batch_dims + + 2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims); + const auto dst_full_shape_strides = + packed_shapes_strides + 4 * batch_dims + + 2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims) + + 2 * (dst_outer_inner_dims); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + dot_ev = fn( + exec_q, x1_data, x2_data, dst_data, batches, x1_outer_nelems, + inner_nelems, x2_outer_nelems, batch_dims, batch_shape_strides, + x1_batch_offset, x2_batch_offset, dst_batch_offset, inner_dims, + x1_outer_dims, x1_outer_inner_shapes_strides, x2_outer_dims, + x2_outer_inner_shapes_strides, x1_outer_dims + x2_outer_dims, + dst_outer_shapes_strides, dst_full_shape_strides, all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dot_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + } + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst}, host_task_events), + dot_ev); +} + +template +py::object py_dot_result_type(const py::dtype &input1_dtype, + const py::dtype &input2_dtype, + const output_typesT &output_types_table) +{ + int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl + int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl + int src1_typeid = -1; + int src2_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + src1_typeid = array_types.typenum_to_lookup_id(tn1); + src2_typeid = 
array_types.typenum_to_lookup_id(tn2); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 || + src2_typeid >= td_ns::num_types) { + throw std::runtime_error("binary output type lookup failed"); + } + int dst_typeid = output_types_table[src1_typeid][src2_typeid]; + + if (dst_typeid < 0) { + auto res = py::none(); + return py::cast(res); + } + else { + auto dst_typenum_t = static_cast(dst_typeid); + auto dt = type_utils::_dtype_from_typenum(dst_typenum_t); + + return py::cast(dt); + } +} + +void init_dot(py::module_ m) +{ + init_dot_atomic_support_vector(); + init_dot_dispatch_tables(); + + m.def("_dot", &py_dot, "", py::arg("x1"), py::arg("x2"), + py::arg("batch_dims"), py::arg("x1_outer_dims"), + py::arg("x2_outer_dims"), py::arg("inner_dims"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto dot_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_dot_result_type(dtype1, dtype2, dot_output_id_table); + }; + m.def("_dot_result_type", dot_result_type_pyapi, ""); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot.hpp new file mode 100644 index 000000000000..f6a23ace5cd9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/linalg_functions/dot.hpp @@ -0,0 +1,45 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
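Taken together, dot.cpp registers _dot and _dot_result_type and fills its thirteen dispatch tables once at init time; every call afterwards reduces to table lookups keyed by typenum_to_lookup_id. Below is a reduced, self-contained sketch of that table-of-function-pointers pattern, using two stand-in types instead of the full td_ns type list; all names in it are illustrative, not the library's.

#include <cstdio>

constexpr int num_types = 2; // stand-in for td_ns::num_types

using binop_fn_ptr_t = double (*)(double, double);

template <typename T1, typename T2> double typed_dot(double a, double b)
{
    return a * b; // placeholder for a real kernel instantiation
}

// Factory per type pair; the real Gemm*/DotProduct* factories return
// nullptr when the (T1, T2) output type is not defined.
template <typename T1, typename T2> struct DotFactory
{
    binop_fn_ptr_t get() { return typed_dot<T1, T2>; }
};

static binop_fn_ptr_t dispatch_table[num_types][num_types];

template <typename T1, typename T2> void populate_entry(int i, int j)
{
    dispatch_table[i][j] = DotFactory<T1, T2>{}.get();
}

int main()
{
    // populate once, analogous to init_dot_dispatch_tables()
    populate_entry<float, float>(0, 0);
    populate_entry<float, double>(0, 1);
    populate_entry<double, float>(1, 0);
    populate_entry<double, double>(1, 1);

    int t1 = 1, t2 = 0; // runtime ids, as from typenum_to_lookup_id
    std::printf("%g\n", dispatch_table[t1][t2](3.0, 4.0)); // 12
}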
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_dot(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
new file mode 100644
index 000000000000..66b9b5004575
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
@@ -0,0 +1,58 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
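dot.hpp itself only exports init_dot(py::module_); the convention is that each translation unit owns its registrations while a central module body chains the init_* hooks. A minimal pybind11 sketch of that wiring follows; the module name is made up, since the real module assembly is not part of this hunk.

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Each feature area contributes one registration hook, as dot.hpp does.
void init_dot(py::module_ m)
{
    m.def("_dot_placeholder", []() { return 42; });
}

// Hypothetical module name; dpnp's actual extension module differs.
PYBIND11_MODULE(_tensor_impl_sketch, m)
{
    init_dot(m); // additional init_* hooks would follow here
}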
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+
+#include "reductions/reduction_atomic_support.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::py_internal::atomic_support
+{
+
+template <typename fnT, typename T>
+struct DotAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value) {
+            return atomic_support::fixed_decision<false>;
+        }
+        else {
+            return atomic_support::check_atomic_support<T>;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::py_internal::atomic_support
diff --git a/dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp b/dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
new file mode 100644
index 000000000000..984f71a4c183
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
@@ -0,0 +1,405 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
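DotAtomicSupportFactory encodes a single policy: complex accumulation never takes the atomic path, while every other type defers to a runtime capability check against the allocation and device. A self-contained sketch of that if constexpr selection, with the device probe stubbed out as a placeholder:

#include <complex>
#include <iostream>
#include <type_traits>

template <typename T> struct is_complex : std::false_type {};
template <typename T> struct is_complex<std::complex<T>> : std::true_type {};

using support_fn_t = bool (*)();

// Stand-ins: the real code returns fixed_decision / check_atomic_support
// functions that inspect the SYCL queue and USM allocation kind.
bool fixed_false() { return false; }
template <typename T> bool probe_device() { return sizeof(T) <= 8; }

template <typename T> support_fn_t select_support_fn()
{
    if constexpr (is_complex<T>::value) {
        return fixed_false; // atomics do not cover complex accumulators
    }
    else {
        return probe_device<T>; // defer to the runtime capability check
    }
}

int main()
{
    std::cout << select_support_fn<double>()() << '\n';              // 1
    std::cout << select_support_fn<std::complex<float>>()() << '\n'; // 0
}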
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include "kernels/linalg_functions/dot_product.hpp" +#include "kernels/linalg_functions/gemm.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +template +struct DotAtomicOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +// add separate type support lists for atomic vs. temps +// gemm, gevm, and dot product share output type struct +template +struct DotNoAtomicOutputType +{ + using value_type = typename std::disjunction< + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::DefaultResultEntry>::result_type; + + static constexpr bool is_defined = !std::is_same_v; +}; + +template +struct DotTypeMapFactory +{ + /*! 
@brief get typeid for output type of kernels called by py_dot */ + std::enable_if_t::value, int> get() + { + using rT1 = typename DotNoAtomicOutputType::value_type; + using rT2 = typename DotAtomicOutputType::value_type; + static_assert(std::is_same_v || std::is_same_v); + return td_ns::GetTypeid{}.get(); + } +}; + +template +struct GemmBatchAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_batch_impl; + return fn; + } + } +}; + +template +struct GemmBatchContigAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_contig_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_batch_contig_impl; + return fn; + } + } +}; + +template +struct GemmAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_impl; + return fn; + } + } +}; + +template +struct GemmContigAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_contig_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = gemm_contig_impl; + return fn; + } + } +}; + +template +struct GemmTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_tree_impl; + return fn; + } + } +}; + +template +struct GemmContigTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_contig_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_contig_tree_impl; + return fn; + } + } +}; + +template +struct GemmBatchTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_batch_tree_impl; + return fn; + } + } +}; + +template +struct GemmBatchContigTempsFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::gemm_batch_contig_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = gemm_batch_contig_tree_impl; + return fn; + } + } +}; + +template +struct DotProductAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = dot_product_impl; + return fn; + } + } +}; + +template +struct DotProductNoAtomicFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = dot_product_tree_impl; + return 
fn; + } + } +}; + +template +struct DotProductContigAtomicFactory +{ + fnT get() + { + if constexpr (!DotAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_contig_impl; + using T3 = typename DotAtomicOutputType::value_type; + fnT fn = dot_product_contig_impl; + return fn; + } + } +}; + +template +struct DotProductContigNoAtomicFactory +{ + fnT get() + { + if constexpr (!DotNoAtomicOutputType::is_defined) { + fnT fn = nullptr; + return fn; + } + else { + using dpctl::tensor::kernels::dot_product_contig_tree_impl; + using T3 = typename DotNoAtomicOutputType::value_type; + fnT fn = dot_product_contig_tree_impl; + return fn; + } + } +}; + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linear_sequences.cpp b/dpnp/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..9a7bf2dbcc0f --- /dev/null +++ b/dpnp/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,306 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
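The Dot*OutputType structs above resolve the result type of a (T1, T2) pair as the first matching BinaryTypeMapResultEntry in a std::disjunction, with a default entry marking the pair undefined; is_defined is what lets the kernel factories hand back nullptr for unsupported combinations. A reduced, self-contained sketch of that first-match idiom:

#include <cstdint>
#include <type_traits>

// One map entry: matches when T1/T2 equal the listed argument types.
template <typename T1, typename A1, typename T2, typename A2, typename R>
struct MapEntry
    : std::bool_constant<std::is_same_v<T1, A1> && std::is_same_v<T2, A2>>
{
    using result_type = R;
};

struct Unsupported : std::true_type // always matches; used as fallback
{
    using result_type = void;
};

template <typename T1, typename T2> struct DotOutputType
{
    // std::disjunction inherits from the first entry whose ::value is
    // true, so ::result_type comes from the first matching map entry.
    using value_type = typename std::disjunction<
        MapEntry<T1, std::int32_t, T2, std::int32_t, std::int32_t>,
        MapEntry<T1, float, T2, float, float>,
        Unsupported>::result_type;

    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
};

static_assert(std::is_same_v<DotOutputType<float, float>::value_type, float>);
static_assert(!DotOutputType<float, std::int32_t>::is_defined);

int main() { return 0; }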
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include // py::cast> +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +#include "linear_sequences.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, 
{linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. + */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/linear_sequences.hpp b/dpnp/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..45cf45153462 --- /dev/null +++ b/dpnp/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,66 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
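linear_sequences.cpp dispatches two fill kernels: the step form computes value[i] = start + i * step, and the affine form interpolates from start toward end, presumably dividing by len - 1 when include_endpoint is set so the last element equals end, and by len otherwise. A scalar sketch under that assumption about the endpoint handling:

#include <cstddef>
#include <iostream>
#include <vector>

// value[i] = start + i * step  (the lin_space_step semantics)
std::vector<double> lin_step(double start, double step, std::size_t n)
{
    std::vector<double> out(n);
    for (std::size_t i = 0; i < n; ++i)
        out[i] = start + static_cast<double>(i) * step;
    return out;
}

// Affine interpolation between start and end; with include_endpoint the
// divisor is n - 1 so out.back() == end (n >= 2 assumed in that case).
std::vector<double> lin_affine(double start, double end, std::size_t n,
                               bool include_endpoint)
{
    std::vector<double> out(n);
    const double denom = static_cast<double>(include_endpoint ? n - 1 : n);
    for (std::size_t i = 0; i < n; ++i)
        out[i] = start + (end - start) * (static_cast<double>(i) / denom);
    return out;
}

int main()
{
    for (double v : lin_affine(0.0, 1.0, 5, true))
        std::cout << v << ' '; // 0 0.25 0.5 0.75 1
    std::cout << '\n';
}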
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/all.cpp b/dpnp/tensor/libtensor/source/reductions/all.cpp new file mode 100644 index 000000000000..a901b9e1d9a3 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/all.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
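The file below maps the boolean reduction onto the shared reduction kernels: sycl::logical_and as the reduction op, std::int32_t as the destination type, with dedicated strided, axis0-contiguous, and axis1-contiguous dispatch vectors. A scalar sketch of the axis1-contiguous case, the layout those specializations target:

#include <cstdint>
#include <iostream>
#include <vector>

// Reduce the trailing axis of a (rows x cols) row-major array with a
// logical AND, writing std::int32_t 0/1 per row, as in the axis1 case.
template <typename srcT>
std::vector<std::int32_t>
all_axis1(const std::vector<srcT> &src, std::size_t rows, std::size_t cols)
{
    std::vector<std::int32_t> dst(rows, 1); // identity of logical AND
    for (std::size_t r = 0; r < rows; ++r)
        for (std::size_t c = 0; c < cols; ++c)
            dst[r] = dst[r] && (src[r * cols + c] != srcT(0));
    return dst; // for "any", the op is OR and the identity is 0
}

int main()
{
    std::vector<float> a{1.f, 2.f, 0.f, 3.f}; // rows {1,2} and {0,3}
    for (auto v : all_axis1(a, 2, 2))
        std::cout << v << ' '; // 1 0
    std::cout << '\n';
}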
+//===---------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + all_reduction_strided_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + all_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static reduction_contig_impl_fn_ptr + all_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; + +template +struct AllStridedFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } +}; + +template +struct AllAxis1ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl; + } +}; + +template +struct AllAxis0ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl; + } +}; + +void populate_all_dispatch_vectors(void) +{ + using td_ns::DispatchVectorBuilder; + + DispatchVectorBuilder + all_dvb1; + all_dvb1.populate_dispatch_vector(all_reduction_strided_dispatch_vector); + + DispatchVectorBuilder + all_dvb2; + all_dvb2.populate_dispatch_vector( + all_reduction_axis1_contig_dispatch_vector); + + DispatchVectorBuilder + all_dvb3; + all_dvb3.populate_dispatch_vector( + all_reduction_axis0_contig_dispatch_vector); +}; + +using atomic_support::atomic_support_fn_ptr_t; +using atomic_support::check_atomic_support; +static atomic_support_fn_ptr_t all_atomic_support = + check_atomic_support; + +} // namespace impl + +void init_all(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_all_dispatch_vectors(); + using impl::all_reduction_axis0_contig_dispatch_vector; + using impl::all_reduction_axis1_contig_dispatch_vector; + using impl::all_reduction_strided_dispatch_vector; + + using impl::all_atomic_support; + + auto all_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + all_reduction_axis1_contig_dispatch_vector, + all_reduction_axis0_contig_dispatch_vector, + all_reduction_strided_dispatch_vector, all_atomic_support); + }; + m.def("_all", all_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/all.hpp b/dpnp/tensor/libtensor/source/reductions/all.hpp new file mode 100644 index 000000000000..5fb184e37c66 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/all.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, 
Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_all(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/any.cpp b/dpnp/tensor/libtensor/source/reductions/any.cpp new file mode 100644 index 000000000000..6859e46cbc4a --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/any.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    any_reduction_strided_dispatch_vector[td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    any_reduction_axis1_contig_dispatch_vector[td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    any_reduction_axis0_contig_dispatch_vector[td_ns::num_types];
+
+template <typename fnT, typename srcTy>
+struct AnyStridedFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
+                                                           ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AnyAxis1ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis1_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy>
+struct AnyAxis0ContigFactory
+{
+    fnT get() const
+    {
+        using dstTy = std::int32_t;
+        using ReductionOpT = sycl::logical_or<dstTy>;
+        return dpctl::tensor::kernels::
+            reduction_axis0_over_group_with_atomics_contig_impl<srcTy, dstTy,
+                                                                ReductionOpT>;
+    }
+};
+
+void populate_any_dispatch_vectors(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    DispatchVectorBuilder<reduction_strided_impl_fn_ptr, AnyStridedFactory,
+                          td_ns::num_types>
+        any_dvb1;
+    any_dvb1.populate_dispatch_vector(any_reduction_strided_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis1ContigFactory,
+                          td_ns::num_types>
+        any_dvb2;
+    any_dvb2.populate_dispatch_vector(
+        any_reduction_axis1_contig_dispatch_vector);
+
+    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis0ContigFactory,
+                          td_ns::num_types>
+        any_dvb3;
+    any_dvb3.populate_dispatch_vector(
+        any_reduction_axis0_contig_dispatch_vector);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+using atomic_support::check_atomic_support;
+static atomic_support_fn_ptr_t any_atomic_support =
+    check_atomic_support<std::int32_t>;
+
+} // namespace impl
+
+void init_any(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_any_dispatch_vectors();
+        using impl::any_reduction_axis0_contig_dispatch_vector;
+        using impl::any_reduction_axis1_contig_dispatch_vector;
+        using impl::any_reduction_strided_dispatch_vector;
+
+        using impl::any_atomic_support;
+
+        auto any_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
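+            // py_boolean_reduction (declared in reduction_over_axis.hpp)
+            // validates the queue and arrays, then selects the axis-1
+            // contiguous, axis-0 contiguous, or generic strided kernel from
+            // the dispatch vectors above; the any_atomic_support callback
+            // reports whether the device supports the atomics these kernels
+            // rely on.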
+            return py_boolean_reduction(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                any_reduction_axis1_contig_dispatch_vector,
+                any_reduction_axis0_contig_dispatch_vector,
+                any_reduction_strided_dispatch_vector, any_atomic_support);
+        };
+        m.def("_any", any_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/any.hpp b/dpnp/tensor/libtensor/source/reductions/any.hpp
new file mode 100644
index 000000000000..4e368a674615
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/any.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_any(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/argmax.cpp b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
new file mode 100644
index 000000000000..af602371dfc5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/argmax.cpp
@@ -0,0 +1,276 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
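+///
+/// The argmax kernels return the index of a maximal element over the reduced
+/// axes: integer inputs compare values with sycl::maximum, while other types
+/// (half, float, double, complex) use su_ns::Maximum so that NaNs are
+/// handled consistently; ties are broken toward the smaller index by
+/// applying sycl::minimum to the candidate indices.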
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
+static search_strided_impl_fn_ptr
+    argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static search_contig_impl_fn_ptr
+    argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportForArgmaxReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::int64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgmaxOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgmaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Maximum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_argmax_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<search_strided_impl_fn_ptr,
+                         ArgmaxOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgmaxOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgmaxOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_argmax(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_argmax_over_axis_dispatch_tables;
+        populate_argmax_over_axis_dispatch_tables();
+        using impl::argmax_over_axis0_contig_temps_dispatch_table;
+        using impl::argmax_over_axis1_contig_temps_dispatch_table;
+        using impl::argmax_over_axis_strided_temps_dispatch_table;
+
+        auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_search_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                argmax_over_axis_strided_temps_dispatch_table,
+                argmax_over_axis0_contig_temps_dispatch_table,
+                argmax_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/argmax.hpp b/dpnp/tensor/libtensor/source/reductions/argmax.hpp
new file mode 100644
index 000000000000..3274f8c7d0cb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/argmax.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmax(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/argmin.cpp b/dpnp/tensor/libtensor/source/reductions/argmin.cpp new file mode 100644 index 000000000000..4869b75eacf9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/argmin.cpp @@ -0,0 +1,276 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
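+///
+/// Analogous to argmax with the value comparison reversed: sycl::minimum for
+/// integer inputs, su_ns::Minimum where NaNs must be handled, with ties
+/// again resolved toward the smaller index.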
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
+static search_strided_impl_fn_ptr
+    argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
+static search_contig_impl_fn_ptr
+    argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static search_contig_impl_fn_ptr
+    argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportForArgminReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::int64_t>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_over_group_temps_strided_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis1_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct ArgminOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportForArgminReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<srcTy> &&
+                          !std::is_same_v<srcTy, bool>) {
+                // op for values
+                using ReductionOpT = sycl::minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+            else {
+                // op for values
+                using ReductionOpT = su_ns::Minimum<srcTy>;
+                // op for indices
+                using IndexOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    search_axis0_over_group_temps_contig_impl<
+                        srcTy, dstTy, ReductionOpT, IndexOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_argmin_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<search_strided_impl_fn_ptr,
+                         ArgminOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgminOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<search_contig_impl_fn_ptr,
+                         ArgminOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_argmin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_argmin_over_axis_dispatch_tables;
+        populate_argmin_over_axis_dispatch_tables();
+        using impl::argmin_over_axis0_contig_temps_dispatch_table;
+        using impl::argmin_over_axis1_contig_temps_dispatch_table;
+        using impl::argmin_over_axis_strided_temps_dispatch_table;
+
+        auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_search_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                argmin_over_axis_strided_temps_dispatch_table,
+                argmin_over_axis0_contig_temps_dispatch_table,
+                argmin_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/argmin.hpp b/dpnp/tensor/libtensor/source/reductions/argmin.hpp
new file mode 100644
index 000000000000..1865c258a527
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/argmin.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp new file mode 100644 index 000000000000..351eab82ee6b --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/logsumexp.cpp @@ -0,0 +1,255 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
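+///
+/// logsumexp(x) = log(sum(exp(x))) is computed with a tree reduction over
+/// temporaries only (there is no atomics path); the pairwise combiner
+/// su_ns::LogSumExp is numerically stable, effectively computing
+/// max(a, b) + log1p(exp(-|a - b|)) instead of a naive exp/sum/log.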
+//===---------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForLogSumExpReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
+
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
+
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
+
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
+
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                        ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct LogSumExpOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            using ReductionOpT = su_ns::LogSumExp<dstTy>;
+            return dpctl::tensor::kernels::
+                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                             ReductionOpT>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_logsumexp_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
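+    // Each table below is indexed by [source typeid][destination typeid]:
+    // one generic strided kernel plus two contiguous fast paths, reducing
+    // over the trailing (axis-1) or leading (axis-0) dimension.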
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         LogSumExpOverAxisTempsStridedFactory, num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(
+        logsumexp_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         LogSumExpOverAxis1TempsContigFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(
+        logsumexp_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         LogSumExpOverAxis0TempsContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(
+        logsumexp_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_logsumexp(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_logsumexp_over_axis_dispatch_tables;
+        populate_logsumexp_over_axis_dispatch_tables();
+        using impl::logsumexp_over_axis0_contig_temps_dispatch_table;
+        using impl::logsumexp_over_axis1_contig_temps_dispatch_table;
+        using impl::logsumexp_over_axis_strided_temps_dispatch_table;
+
+        using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+        using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+
+        auto logsumexp_pyapi = [&](const arrayT &src,
+                                   int trailing_dims_to_reduce,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_tree_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                logsumexp_over_axis_strided_temps_dispatch_table,
+                logsumexp_over_axis0_contig_temps_dispatch_table,
+                logsumexp_over_axis1_contig_temps_dispatch_table);
+        };
+        m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype,
+                                             const py::dtype &output_dtype) {
+            return py_tree_reduction_dtype_supported(
+                input_dtype, output_dtype,
+                logsumexp_over_axis_strided_temps_dispatch_table);
+        };
+        m.def("_logsumexp_over_axis_dtype_supported",
+              logsumexp_dtype_supported, "", py::arg("arg_dtype"),
+              py::arg("out_dtype"));
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/logsumexp.hpp b/dpnp/tensor/libtensor/source/reductions/logsumexp.hpp
new file mode 100644
index 000000000000..2e2c19877db6
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/logsumexp.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logsumexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/max.cpp b/dpnp/tensor/libtensor/source/reductions/max.cpp new file mode 100644 index 000000000000..628f7cfe8606 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/max.cpp @@ -0,0 +1,407 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
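+///
+/// Two kernel families back this reduction: an atomics-based path for types
+/// with device atomic support (chosen at runtime through
+/// max_atomic_support_vector) and a temporaries-based tree reduction for the
+/// rest; floating-point types use su_ns::Maximum so NaNs are handled
+/// consistently, integers use sycl::maximum.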
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    max_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    max_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    max_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+/* @brief Types supported by max reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMaxReductionAtomic
+{
+    /* value is true if a kernel for <argTy, outTy> must be instantiated,
+     * false otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMaxReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy,
+                                    std::uint16_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MaxOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMaxReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Maximum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_max_over_axis_dispatch_tables(void)
+{
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MaxOverAxisAtomicStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MaxOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis1AtomicContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis0AtomicContigFactory, td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MaxOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types];
+
+void populate_max_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::MaxAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, MaxAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(max_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_max(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_max_over_axis_dispatch_tables;
+        populate_max_over_axis_dispatch_tables();
+        using impl::max_over_axis0_contig_atomic_dispatch_table;
+        using impl::max_over_axis0_contig_temps_dispatch_table;
+        using impl::max_over_axis1_contig_atomic_dispatch_table;
+        using impl::max_over_axis1_contig_temps_dispatch_table;
+        using impl::max_over_axis_strided_atomic_dispatch_table;
+        using impl::max_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_max_atomic_support_dispatch_vector;
+        populate_max_atomic_support_dispatch_vector();
+        using impl::max_atomic_support_vector;
+
+        auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                max_over_axis_strided_atomic_dispatch_table,
+                max_over_axis0_contig_atomic_dispatch_table,
+                max_over_axis1_contig_atomic_dispatch_table,
+                max_over_axis_strided_temps_dispatch_table,
+                max_over_axis0_contig_temps_dispatch_table,
+                max_over_axis1_contig_temps_dispatch_table,
+                max_atomic_support_vector);
+        };
+        m.def("_max_over_axis", max_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/max.hpp b/dpnp/tensor/libtensor/source/reductions/max.hpp
new file mode 100644
index 000000000000..bc242dc8d74b
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/max.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_max(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/min.cpp b/dpnp/tensor/libtensor/source/reductions/min.cpp new file mode 100644 index 000000000000..68bfdb583b0b --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/min.cpp @@ -0,0 +1,409 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
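+///
+/// Mirrors max.cpp with the ordering reversed: sycl::minimum for integers,
+/// su_ns::Minimum for floating-point types where NaNs must be handled, and
+/// the same runtime choice between atomics-based and temporaries-based
+/// kernels.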
+//===---------------------------------------------------------------------===//
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/reductions.hpp"
+#include "utils/sycl_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    min_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    min_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    min_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+/* @brief Types supported by min reduction code based on atomic_ref */
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMinReductionAtomic
+{
+    /* value is true if a kernel for <argTy, outTy> must be instantiated,
+     * false otherwise */
+    static constexpr bool is_defined = std::disjunction<
+        // input int32
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+        // input int64
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        // input uint64
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename argTy, typename outTy>
+struct TypePairSupportDataForMinReductionTemps
+{
+    static constexpr bool is_defined = std::disjunction<
+        // input bool
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
+        // input int8_t
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
+        // input uint8_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
+
+        // input int16_t
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
+        // input uint16_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy,
+                                    std::uint16_t>,
+
+        // input int32_t
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
+        // input uint32_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy,
+                                    std::uint32_t>,
+
+        // input int64_t
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+
+        // input uint64_t
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy,
+                                    std::uint64_t>,
+
+        // input half
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
+
+        // input float
+        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
+
+        // input double
+        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
+
+        // input std::complex
+        td_ns::TypePairDefinedEntry<argTy, std::complex<float>, outTy,
+                                    std::complex<float>>,
+
+        td_ns::TypePairDefinedEntry<argTy, std::complex<double>, outTy,
+                                    std::complex<double>>,
+
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxisAtomicStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_with_atomics_strided_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxisTempsStridedFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
+                                                            ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis1AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis0AtomicContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionAtomic<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_floating_point<dstTy>::value) {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_with_atomics_contig_impl<
+                        srcTy, dstTy, ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis1TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename srcTy, typename dstTy>
+struct MinOverAxis0TempsContigFactory
+{
+    fnT get() const
+    {
+        if constexpr (TypePairSupportDataForMinReductionTemps<
+                          srcTy, dstTy>::is_defined) {
+            if constexpr (std::is_integral_v<dstTy> &&
+                          !std::is_same_v<dstTy, bool>) {
+                using ReductionOpT = sycl::minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+            else {
+                using ReductionOpT = su_ns::Minimum<dstTy>;
+                return dpctl::tensor::kernels::
+                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
+                                                                 ReductionOpT>;
+            }
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void populate_min_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using td_ns::DispatchTableBuilder;
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MinOverAxisAtomicStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         MinOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis1AtomicContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis0AtomicContigFactory, td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table);
+
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         MinOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types];
+
+void populate_min_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::MinAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, MinAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(min_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_min(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_min_over_axis_dispatch_tables;
+        populate_min_over_axis_dispatch_tables();
+        using impl::min_over_axis0_contig_atomic_dispatch_table;
+        using impl::min_over_axis0_contig_temps_dispatch_table;
+        using impl::min_over_axis1_contig_atomic_dispatch_table;
+        using impl::min_over_axis1_contig_temps_dispatch_table;
+        using impl::min_over_axis_strided_atomic_dispatch_table;
+        using impl::min_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_min_atomic_support_dispatch_vector;
+        populate_min_atomic_support_dispatch_vector();
+        using impl::min_atomic_support_vector;
+
+        auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                min_over_axis_strided_atomic_dispatch_table,
+                min_over_axis0_contig_atomic_dispatch_table,
+                min_over_axis1_contig_atomic_dispatch_table,
+                min_over_axis_strided_temps_dispatch_table,
+                min_over_axis0_contig_temps_dispatch_table,
+                min_over_axis1_contig_temps_dispatch_table,
+                min_atomic_support_vector);
+        };
+        m.def("_min_over_axis", min_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/min.hpp b/dpnp/tensor/libtensor/source/reductions/min.hpp
new file mode 100644
index 000000000000..e054f44539f3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/min.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_min(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/prod.cpp b/dpnp/tensor/libtensor/source/reductions/prod.cpp new file mode 100644 index 000000000000..9ecd403159b0 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/prod.cpp @@ -0,0 +1,460 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input 
int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr 
(TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types]; + +void populate_prod_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::ProductAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(prod_atomic_support_vector); +} + +} // namespace impl + +void init_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis0_contig_temps_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_temps_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + using impl::populate_prod_atomic_support_dispatch_vector; + populate_prod_atomic_support_dispatch_vector(); + using impl::prod_atomic_support_vector; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + prod_over_axis1_contig_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_temps_dispatch_table, + prod_over_axis1_contig_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), 
py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/prod.hpp b/dpnp/tensor/libtensor/source/reductions/prod.hpp new file mode 100644 index 000000000000..15b1c07e5ddd --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_prod(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp new file mode 100644 index 000000000000..b8a042e9a55b --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -0,0 +1,251 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForHypotReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + 
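The `TypePairSupportData*` structs above all follow the same compile-time gating idiom, but the angle-bracketed template arguments of the `td_ns::TypePairDefinedEntry` entries do not survive in this rendering. As a reading aid, here is a small self-contained C++17 sketch of that idiom; the entry names mirror dpctl's `type_dispatch_building.hpp`, while the concrete type pairs listed are illustrative only, not the table from the patch.

#include <cstdint>
#include <type_traits>

// An entry contributes value == true to the disjunction only when the
// queried (Ty, outTy) pair equals the (ArgTy, ResTy) pair it encodes;
// every defined entry reports is_defined == true.
template <typename Ty, typename ArgTy, typename outTy, typename ResTy>
struct TypePairDefinedEntry
    : std::bool_constant<std::is_same_v<Ty, ArgTy> &&
                         std::is_same_v<outTy, ResTy>>
{
    static constexpr bool is_defined = true;
};

// Fall-through terminal: value == true, so std::disjunction always stops
// here when nothing matched, and it reports is_defined == false.
struct NotDefinedEntry : std::true_type
{
    static constexpr bool is_defined = false;
};

// std::disjunction inherits from the first base whose ::value is true,
// so ::is_defined is taken from the first matching entry, or from the
// terminal entry when the pair is unsupported.
template <typename srcTy, typename dstTy>
struct PairSupportSketch
{
    static constexpr bool is_defined = std::disjunction<
        TypePairDefinedEntry<srcTy, std::int32_t, dstTy, std::int32_t>,
        TypePairDefinedEntry<srcTy, float, dstTy, float>,
        TypePairDefinedEntry<srcTy, double, dstTy, double>,
        NotDefinedEntry>::is_defined;
};

static_assert(PairSupportSketch<float, float>::is_defined);
static_assert(!PairSupportSketch<float, double>::is_defined);

A factory's `get()` then returns a kernel function pointer only when `is_defined` holds for its `(srcTy, dstTy)` instantiation, and `nullptr` otherwise, which is what leaves the unsupported cells of the dispatch tables empty.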
+template +struct HypotOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_hypot_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_reduce_hypot(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_hypot_over_axis_dispatch_tables; + populate_hypot_over_axis_dispatch_tables(); + using impl::hypot_over_axis0_contig_temps_dispatch_table; + using impl::hypot_over_axis1_contig_temps_dispatch_table; + using impl::hypot_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + hypot_over_axis_strided_temps_dispatch_table, + hypot_over_axis0_contig_temps_dispatch_table, + hypot_over_axis1_contig_temps_dispatch_table); + }; + m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto hypot_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + hypot_over_axis_strided_temps_dispatch_table); + }; + m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp new file mode 100644 index 000000000000..c0a16345af75 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/reduce_hypot.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reduce_hypot(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp new file mode 100644 index 000000000000..af6c3f0d513a --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -0,0 +1,143 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+#include "utils/type_utils.hpp"
+
+namespace dpctl::tensor::py_internal::atomic_support
+{
+
+typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc);
+
+/*! @brief Function which returns a constant value for atomic support */
+template <bool return_value>
+bool fixed_decision(const sycl::queue &, sycl::usm::alloc)
+{
+    return return_value;
+}
+
+/*! @brief Template for querying atomic support for a type on a device */
+template <typename T>
+bool check_atomic_support(const sycl::queue &exec_q,
+                          sycl::usm::alloc usm_alloc_type)
+{
+    static constexpr bool atomic32 = (sizeof(T) == 4);
+    static constexpr bool atomic64 = (sizeof(T) == 8);
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr ((!atomic32 && !atomic64) || is_complex<T>::value) {
+        return fixed_decision<false>(exec_q, usm_alloc_type);
+    }
+    else {
+        bool supports_atomics = false;
+        const sycl::device &dev = exec_q.get_device();
+        if constexpr (atomic64) {
+            if (!dev.has(sycl::aspect::atomic64)) {
+                return false;
+            }
+        }
+        switch (usm_alloc_type) {
+        case sycl::usm::alloc::shared:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_shared_allocations);
+            break;
+        case sycl::usm::alloc::host:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_host_allocations);
+            break;
+        case sycl::usm::alloc::device:
+            supports_atomics = true;
+            break;
+        default:
+            supports_atomics = false;
+        }
+        return supports_atomics;
+    }
+}
+
+template <typename fnT, typename T>
+struct ArithmeticAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (std::is_floating_point_v<T> ||
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
+        {
+            // For real- and complex-valued floating-point types, tree
+            // reduction has better round-off accumulation properties: its
+            // round-off error grows proportionally to log2(reduction_size),
+            // while the naive elementwise summation used by the atomic
+            // implementation has round-off error growing proportionally to
+            // reduction_size. Hence reductions over floating-point types
+            // should always use the tree-reduction algorithm, even though
+            // an atomic implementation may be applicable.
+            return fixed_decision<false>;
+        }
+        else {
+            return check_atomic_support<T>;
+        }
+    }
+};
+
+template <typename fnT, typename T>
+struct MinMaxAtomicSupportFactory
+{
+    fnT get() { return check_atomic_support<T>; }
+};
+
+template <typename fnT, typename T>
+struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct ProductAtomicSupportFactory
+    : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+} // namespace dpctl::tensor::py_internal::atomic_support
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_common.cpp b/dpnp/tensor/libtensor/source/reductions/reduction_common.cpp
new file mode 100644
index 000000000000..fca5e09e2fe5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_common.cpp
@@ -0,0 +1,69 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "all.hpp"
+#include "any.hpp"
+#include "argmax.hpp"
+#include "argmin.hpp"
+#include "logsumexp.hpp"
+#include "max.hpp"
+#include "min.hpp"
+#include "prod.hpp"
+#include "reduce_hypot.hpp"
+#include "sum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+/*! @brief Add reduction functions to Python module */
+void init_reduction_functions(py::module_ m)
+{
+    init_all(m);
+    init_any(m);
+    init_argmax(m);
+    init_argmin(m);
+    init_logsumexp(m);
+    init_max(m);
+    init_min(m);
+    init_prod(m);
+    init_reduce_hypot(m);
+    init_sum(m);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_common.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_common.hpp
new file mode 100644
index 000000000000..4df67c16bc4e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_common.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_reductions_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_reduction_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
new file mode 100644
index 000000000000..8224163ccb19
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -0,0 +1,1307 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension, specifically functions for reductions. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +/* ====================== dtype supported ======================== */ + +/*! @brief Template implementing Python API for querying type support by + * reduction which may support atomics */ +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == "device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = check_atomic_support[out_typeid](q, kind); + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + +/*! 
@brief Template implementing Python API for querying type support by tree + * reduction */ +template +bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + auto fn = temps_dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +/* ==================== Generic reductions ====================== */ + +/*! @brief Template implementing Python API for reduction over axis which may + * support atomics */ +template +std::pair py_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const contig_fnT &axis0_atomic_dispatch_table, + const contig_fnT &axis1_atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table, + const SupportAtomicFnT &check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + 
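// [Editor's sketch, not part of the patch] The lookups that follow map the
// arrays' NumPy type numbers onto the indices used by the
// td_ns::num_types x td_ns::num_types dispatch tables; the USM kind of
// dst's allocation then decides between the atomic and the
// temporaries-based implementation. A hypothetical trace for a
// float32 -> float32 reduction into device USM:
//
//   src_typeid = array_types.typenum_to_lookup_id(src_typenum);  // f4 id
//   dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);  // f4 id
//   usm_type == sycl::usm::alloc::device
//   supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type);
//     // false when the entry came from Sum/ProductAtomicSupportFactory
//     // (floating-point reductions prefer tree reduction for accuracy),
//     // but may be true for min/max, whose MinMaxAtomicSupportFactory
//     // queries the device aspects directly
//     // (see reduction_atomic_support.hpp)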
const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + // TODO: not used anywhere + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT 
simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + 
device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + auto tmp_alloc_owner = + std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_alloc_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/* ================= No atomic reductions ====================== */ + +/*! @brief Template implementing Python API for reduction over axis without + * atomics */ +template +std::pair py_tree_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); 
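// [Editor's note, not part of the patch] Same guard as in
// py_reduction_over_axis above: the kernels read src while writing dst
// (possibly over several passes through temporaries), so any aliasing
// between the two would make the result order-dependent.
// MemoryOverlap() compares the address ranges spanned by the two
// usm_ndarrays, and the branch below rejects any intersection with a
// py::value_error.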
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1)) {
+        auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
+        auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT simplified_reduction_shape;
+    shT simplified_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    simplify_iteration_space_1(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        simplified_reduction_shape, simplified_reduction_src_strides,
+        reduction_src_offset);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
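+        // When every source dimension is reduced away there is no
+        // iteration space left; normalizing it to a single "virtual"
+        // element with shape {1} and strides {0} (continued below) lets
+        // the strided kernel run unchanged: it visits exactly one output
+        // element and never advances the src/dst base pointers.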
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (simplified_reduction_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(
+                     simplified_iteration_src_strides[0]) == reduction_nelems);
+        }
+        else if (static_cast<std::size_t>(
+                     simplified_reduction_src_strides[0]) == iter_nelems) {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = temps_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            simplified_reduction_shape, simplified_reduction_src_strides);
+    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    const py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto reduction_ev =
+        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(),
+           iteration_nd, iter_shape_and_strides, iteration_src_offset,
+           iteration_dst_offset,
+           reduction_nd, // number of dimensions being reduced
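+           // The packed device buffer is assumed to hold the iteration
+           // block first (shape, src strides, dst strides, each
+           // iteration_nd entries long, hence the 3 * size() offset used
+           // above), followed by the reduction block of shape and src
+           // strides, 2 * reduction_nd entries in total:
+           //
+           //   [it_shape | it_src_str | it_dst_str | red_shape | red_src_str]
+           //   ^ iter_shape_and_strides            ^ reduction_shape_stride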
+           reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {reduction_ev}, tmp_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, reduction_ev);
+}
+
+/*! @brief Template implementing Python API for searching over an axis */
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_search_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // reduction is over this many trailing indices
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &strided_dispatch_table,
+    const contig_fnT &axis0_contig_dispatch_table,
+    const contig_fnT &axis1_contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("trailing_dims_to_reduce must be positive, but "
+                              "not greater than the rank of the array being "
+                              "reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    std::size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
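+            // Every entry point in this header returns the same pair of
+            // events: the first is a host-task event Python can wait on
+            // before releasing the arrays (keep_args_alive), the second is
+            // the computation event other kernels may depend on. A caller
+            // wanting a synchronous result would do, roughly:
+            //
+            //   auto [ht_ev, comp_ev] = py_search_over_axis(/* ... */);
+            //   comp_ev.wait(); // result is ready
+            //   ht_ev.wait();   // temporaries and references released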
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig && dst_nd == 1) {
+        auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            std::size_t iter_nelems = dst_nelems;
+
+            static constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT compact_reduction_shape;
+    shT compact_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    // compact (rather than simplify) the reduction space: a search must
+    // preserve the traversal order of the reduced elements, since the
+    // reported index depends on it
+    compact_iteration_space(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        compact_reduction_shape, compact_reduction_src_strides);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (compact_reduction_src_strides[0] == 1) {
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(
+                     simplified_iteration_src_strides[0]) == reduction_nelems);
+        }
+        else if (static_cast<std::size_t>(compact_reduction_src_strides[0]) ==
+                 iter_nelems) {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1) {
+            auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event =
+                    dpctl::utils::keep_args_alive(
+                        exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    auto arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            compact_reduction_shape, compact_reduction_src_strides);
+    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    const py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(),
+                      dst.get_data(), iteration_nd, iter_shape_and_strides,
+                      iteration_src_offset, iteration_dst_offset,
+                      reduction_nd, // number of dimensions being reduced
+                      reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {comp_ev}, tmp_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, comp_ev);
+}
+
+/* ================= Atomic only reductions ====================== */
+
+/*!
+ * @brief Template implementing Python API for boolean reductions over an axis
+ */
+template <typename contig_dispatchT,
+          typename strided_dispatchT,
+          typename atomic_support_fnT>
+std::pair<sycl::event, sycl::event>
+    py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
+                         int trailing_dims_to_reduce,
+                         const dpctl::tensor::usm_ndarray &dst,
+                         sycl::queue &exec_q,
+                         const std::vector<sycl::event> &depends,
+                         const contig_dispatchT &axis1_contig_dispatch_vector,
+                         const contig_dispatchT &axis0_contig_dispatch_vector,
+                         const strided_dispatchT &strided_dispatch_vector,
+                         const atomic_support_fnT check_atomic_support)
+{
+    int src_nd = src.get_ndim();
+    int iter_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iter_nd < 0) {
+        throw py::value_error("trailing_dims_to_reduce must be positive, but "
+                              "not greater than the rank of the array being "
+                              "reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iter_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    std::size_t dst_nelems = dst.get_size();
+
+    std::size_t red_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        red_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(dst, src)) {
+        throw py::value_error("Arrays are expected to have no memory overlap");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    static constexpr int int32_typeid =
+        static_cast<int>(td_ns::typenum_t::INT32);
+    if (dst_typeid != int32_typeid) {
+        throw py::value_error(
+            "Unexpected data type of destination array, expecting 'int32'");
+    }
+
+    void *data_ptr = dst.get_data();
+    const auto &ctx = exec_q.get_context();
+    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
+
+    bool supports_atomics = check_atomic_support(exec_q, usm_type);
+    if (!supports_atomics) {
+        throw py::value_error(
+            "This reduction is not supported for this device and usm_type.");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
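+    // The contiguous fast paths below treat src as a logical
+    // (iter_nelems x red_nelems) matrix: C-contiguous src and dst reduce
+    // each row with the axis-1 kernel, while an F-contiguous src reduces
+    // each column with the axis-0 kernel. E.g. a C-contiguous array of
+    // shape (4, 5) reduced over its last axis has iter_nelems == 4 and
+    // red_nelems == 5; each output element consumes one contiguous run
+    // of 5 inputs.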
+    // single-element dst (dst_nelems == 1, not == 0) means a full
+    // reduction of an F-contiguous source, which is one contiguous run
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1)) {
+        auto fn = axis1_contig_dispatch_vector[src_typeid];
+        static constexpr py::ssize_t zero_offset = 0;
+
+        sycl::event red_ev =
+            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
+               zero_offset, zero_offset, depends);
+
+        sycl::event keep_args_event =
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+        return std::make_pair(keep_args_event, red_ev);
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) {
+        auto fn = axis0_contig_dispatch_vector[src_typeid];
+        static constexpr py::ssize_t zero_offset = 0;
+
+        sycl::event red_ev =
+            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
+               zero_offset, zero_offset, depends);
+
+        sycl::event keep_args_event =
+            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+        return std::make_pair(keep_args_event, red_ev);
+    }
+
+    auto src_shape_vecs = src.get_shape_vector();
+    auto src_strides_vecs = src.get_strides_vector();
+    auto dst_strides_vecs = dst.get_strides_vector();
+
+    int simplified_red_nd = trailing_dims_to_reduce;
+
+    using shT = std::vector<py::ssize_t>;
+    shT red_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                        std::end(src_strides_vecs));
+
+    shT simplified_red_shape;
+    shT simplified_red_src_strides;
+    py::ssize_t red_src_offset(0);
+
+    simplify_iteration_space_1(
+        simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides,
+        // output
+        simplified_red_shape, simplified_red_src_strides, red_src_offset);
+
+    shT iter_src_strides(std::begin(src_strides_vecs),
+                         std::begin(src_strides_vecs) + iter_nd);
+    shT const &iter_dst_strides = dst_strides_vecs;
+
+    shT simplified_iter_shape;
+    shT simplified_iter_src_strides;
+    shT simplified_iter_dst_strides;
+    py::ssize_t iter_src_offset(0);
+    py::ssize_t iter_dst_offset(0);
+
+    if (iter_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iter_nd = 1;
+        simplified_iter_shape.push_back(1);
+        simplified_iter_src_strides.push_back(0);
+        simplified_iter_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
+            // output
+            simplified_iter_shape, simplified_iter_src_strides,
+            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
+    }
+
+    if (simplified_red_nd == 1 && iter_nd == 1) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        std::size_t iter_nelems = dst_nelems;
+
+        if (simplified_red_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iter_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iter_dst_strides[0] == 1) &&
+                (static_cast<std::size_t>(simplified_iter_src_strides[0]) ==
+                 red_nelems);
+        }
+        else if (static_cast<std::size_t>(simplified_red_src_strides[0]) ==
+                 iter_nelems) {
+            mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) &&
+                                    (simplified_iter_src_strides[0] == 1);
+        }
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_contig_dispatch_vector[src_typeid];
+
+            sycl::event red_ev =
+                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
+                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
+
+            sycl::event keep_args_event =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+            return std::make_pair(keep_args_event, red_ev);
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_contig_dispatch_vector[src_typeid];
+
+            sycl::event red_ev =
+                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
+                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
+
+            sycl::event keep_args_event =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
+
+            return std::make_pair(keep_args_event, red_ev);
+        }
+    }
+
+    auto fn = strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events{};
+    auto iter_red_metadata_packing_triple_ =
+        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events, simplified_iter_shape,
+            simplified_iter_src_strides, simplified_iter_dst_strides,
+            simplified_red_shape, simplified_red_src_strides);
+    auto packed_shapes_strides_owner =
+        std::move(std::get<0>(iter_red_metadata_packing_triple_));
+    const auto &copy_metadata_ev =
+        std::get<2>(iter_red_metadata_packing_triple_);
+    const py::ssize_t *packed_shapes_and_strides =
+        packed_shapes_strides_owner.get();
+
+    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
+    const py::ssize_t *red_shape_stride =
+        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto red_ev =
+        fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, iter_nd,
+           iter_shape_and_strides, iter_src_offset, iter_dst_offset,
+           simplified_red_nd, red_shape_stride, red_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {red_ev}, packed_shapes_strides_owner);
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, red_ev);
+}
+
+extern void init_reduction_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/reductions/sum.cpp b/dpnp/tensor/libtensor/source/reductions/sum.cpp
new file mode 100644
index 000000000000..9a0d212ed8da
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/reductions/sum.cpp
@@ -0,0 +1,460 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr 
(TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types]; + +void populate_sum_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::SumAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(sum_atomic_support_vector); +} + +} // namespace impl + +void init_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis0_contig_temps_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_temps_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + using impl::populate_sum_atomic_support_dispatch_vector; + populate_sum_atomic_support_dispatch_vector(); + using impl::sum_atomic_support_vector; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_temps_dispatch_table, + sum_over_axis1_contig_temps_dispatch_table, + sum_atomic_support_vector); + }; + 
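+        // py_reduction_over_axis receives both table families plus the
+        // atomic-support vector and picks the atomic or tree path per
+        // call. A hypothetical max-reduction would be wired the same way
+        // (all names below are illustrative, not part of this patch):
+        //
+        //   auto max_pyapi = [&](const arrayT &src, int trailing,
+        //                        const arrayT &dst, sycl::queue &q,
+        //                        const event_vecT &deps = {}) {
+        //       return py_reduction_over_axis(
+        //           src, trailing, dst, q, deps,
+        //           max_strided_atomic_table, max_axis0_atomic_table,
+        //           max_axis1_atomic_table, max_strided_temps_table,
+        //           max_axis0_temps_table, max_axis1_temps_table,
+        //           max_atomic_support_vector);
+        //   };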
m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/reductions/sum.hpp b/dpnp/tensor/libtensor/source/reductions/sum.hpp new file mode 100644 index 000000000000..08add902a049 --- /dev/null +++ b/dpnp/tensor/libtensor/source/reductions/sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sum(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/repeat.cpp b/dpnp/tensor/libtensor/source/repeat.cpp new file mode 100644 index 000000000000..b809160e257b --- /dev/null +++ b/dpnp/tensor/libtensor/source/repeat.cpp @@ -0,0 +1,819 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/repeat.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t; +static repeat_by_sequence_fn_ptr_t + repeat_by_sequence_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t; +static repeat_by_sequence_1d_fn_ptr_t + repeat_by_sequence_1d_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t; +static repeat_by_scalar_fn_ptr_t + repeat_by_scalar_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t; +static repeat_by_scalar_1d_fn_ptr_t + repeat_by_scalar_1d_dispatch_vector[td_ns::num_types]; + +void init_repeat_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::repeat::RepeatSequenceFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalarFactory; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, + {src, reps, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = 
cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the sum of reps + if (!same_orthog_dims || src_axis_nelems != reps_sz || + src_axis_nelems != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * dst_axis_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = 
std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = + fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, + cumsum_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], + reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = repeat_by_sequence_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + split_iteration_space(src_shape_vec, src_strides_vec, axis, axis + 1, + orthog_src_shape, axis_src_shape, + orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + split_iteration_space(dst_shape_vec, dst_strides_vec, axis, axis + 1, + orthog_dst_shape, axis_dst_shape, + orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p, + dst_data_p, reps_data_p, cumsum_data_p, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexers along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], + // data to build indexer for reps array + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + 
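+        // Contract recap: with reps = {1, 2, 3} along the repeated axis,
+        // the caller supplies cumsum = {1, 3, 6} (inclusive prefix sum,
+        // int64), src extent 3 along that axis, and dst extent 6; element
+        // i of src fills the dst slots [cumsum[i] - reps[i], cumsum[i]),
+        // so the kernel can map each dst slot back to its source element
+        // by searching cumsum.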
host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, + {src, reps, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); + + // shape at repeated axis must be equal to the sum of reps + if (src_sz != reps_sz || src_sz != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + dst.get_size()); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, cumsum, or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = 
reps.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shapes_strides = + packed_src_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn( + exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0], + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * (src_axis_nelems * reps)); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if 
(overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, dst_shape_vec[0], + dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = repeat_by_scalar_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + split_iteration_space(src_shape_vec, src_strides_vec, axis, axis + 1, + orthog_src_shape, axis_src_shape, + orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + split_iteration_space(dst_shape_vec, dst_strides_vec, axis, axis + 1, + orthog_dst_shape, axis_dst_shape, + orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + + simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using 
dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p, + dst_data_p, reps, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexer along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends) +{ + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t dst_sz = dst.get_size(); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if ((src_sz * reps) != dst_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + src_sz * reps); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + std::vector host_task_events{}; 
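Every entry point in this file returns a pair of sycl::event objects: the second orders the device computation, while the first is a host task that keeps the Python argument objects alive and releases packed temporaries once the computation finishes (via dpctl's keep_args_alive and async_smart_free helpers). A minimal sketch of that pairing using only standard SYCL 2020 primitives; the function name and the int payload are hypothetical stand-ins:

    // Sketch only: models the (keep-alive event, computation event)
    // contract of the bindings above with plain SYCL primitives.
    #include <sycl/sycl.hpp>

    #include <memory>
    #include <utility>
    #include <vector>

    std::pair<sycl::event, sycl::event>
        launch_with_cleanup(sycl::queue &q,
                            std::shared_ptr<int> packed_metadata,
                            const std::vector<sycl::event> &depends)
    {
        // computation event: device work ordered after `depends`
        sycl::event comp_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(depends);
            cgh.single_task([]() { /* kernel body elided */ });
        });

        // cleanup event: a host task that runs after the computation and
        // drops the last reference to the packed metadata; the real code
        // also decrements Python refcounts here
        sycl::event cleanup_ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(comp_ev);
            cgh.host_task([packed_metadata]() {});
        });

        return std::make_pair(cleanup_ev, comp_ev);
    }

Callers wait on the first event before letting the Python objects go out of scope, and chain further device work off the second.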
+ + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/repeat.hpp b/dpnp/tensor/libtensor/source/repeat.hpp new file mode 100644 index 000000000000..5835377fb29c --- /dev/null +++ b/dpnp/tensor/libtensor/source/repeat.hpp @@ -0,0 +1,83 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
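For orientation, the contract that the four repeat entry points above implement is compact enough to state in host code. The sketch below illustrates the semantics only, not the SYCL kernels dispatched through the tables:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // repeat-by-sequence: src[i] is written reps[i] times; cumsum is the
    // inclusive prefix sum of reps, so src[i] fills the dst segment
    // [cumsum[i] - reps[i], cumsum[i])
    std::vector<double>
        repeat_by_sequence(const std::vector<double> &src,
                           const std::vector<std::int64_t> &reps,
                           const std::vector<std::int64_t> &cumsum)
    {
        std::vector<double> dst(cumsum.empty() ? 0 : cumsum.back());
        for (std::size_t i = 0; i < src.size(); ++i) {
            for (std::int64_t j = cumsum[i] - reps[i]; j < cumsum[i]; ++j) {
                dst[j] = src[i];
            }
        }
        return dst;
    }

    // repeat-by-scalar: every element is written reps times, matching the
    // dst.size() == src.size() * reps check performed above
    std::vector<double> repeat_by_scalar(const std::vector<double> &src,
                                         std::int64_t reps)
    {
        std::vector<double> dst;
        dst.reserve(src.size() * static_cast<std::size_t>(reps));
        for (const double v : src) {
            dst.insert(dst.end(), reps, v);
        }
        return dst;
    }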
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_repeat_dispatch_vectors(void); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 000000000000..573aaeb0a60b --- /dev/null +++ b/dpnp/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,542 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
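The simplify_iteration_space family defined in the next file contracts adjacent axes whose strides line up, so that strided kernels iterate over as few dimensions as possible; the nd == 1 branches additionally flip negative strides and fold the displacement into the array offset. A standalone sketch of the contraction rule, assuming the same merge condition as dpctl's simplify_iteration_stride (the helper name contract_dims is made up):

    #include <cstddef>
    #include <vector>

    using shape_t = std::vector<std::ptrdiff_t>;

    // merge axis pairs where the outer stride steps exactly over the
    // inner axis: strides[i] == shape[i + 1] * strides[i + 1]
    void contract_dims(shape_t &shape, shape_t &strides)
    {
        if (shape.size() < 2) {
            return;
        }
        std::size_t w = 0; // index of the axis currently being grown
        for (std::size_t r = 1; r < shape.size(); ++r) {
            if (strides[w] == shape[r] * strides[r]) {
                shape[w] *= shape[r]; // merge axis r into axis w
                strides[w] = strides[r];
            }
            else {
                ++w;
                shape[w] = shape[r];
                strides[w] = strides[r];
            }
        }
        shape.resize(w + 1);
        strides.resize(w + 1);
    }
    // e.g. a C-contiguous (3, 4) array has strides (4, 1); contraction
    // yields shape (12), strides (1), i.e. a flat 1-d traversal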
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "simplify_iteration_space.hpp" +#include "utils/strided_iters.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + using dpctl::tensor::strides::simplify_iteration_stride; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + + simplified_strides.reserve(nd); + simplified_strides.insert(std::end(simplified_strides), + std::begin(strides), std::end(strides)); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + simplified_strides.push_back((strides[0] >= 0) ? strides[0] + : -strides[0]); + if ((strides[0] < 0) && (shape[0] > 1)) { + offset += (shape[0] - 1) * strides[0]; + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &src_strides, + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_two_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::begin(simplified_shape), shape, + shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_src_strides.insert(std::end(simplified_src_strides), + std::begin(src_strides), + std::end(src_strides)); + assert(simplified_src_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src_offset = 0; + dst_offset = 0; + // 
Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if (src_strides[0] < 0 && dst_strides[0] < 0) { + simplified_src_strides.push_back(-src_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src_offset += (shape[0] - 1) * src_strides[0]; + dst_offset += (shape[0] - 1) * dst_strides[0]; + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_three_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + 
simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_4( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // src3 + std::vector const &src3_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_src3_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &src3_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_four_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_src3_strides.reserve(nd); + simplified_src3_strides.insert(std::end(simplified_src3_strides), + std::begin(src3_strides), + std::end(src3_strides)); + assert(simplified_src3_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_four_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_src3_strides.data(), + simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + src3_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_src3_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + src3_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_src3_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (src3_strides[0] < 0) && (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_src3_strides.push_back(-src3_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + 
src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + src3_offset += src3_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_src3_strides.push_back(src3_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +py::ssize_t _ravel_multi_index_c(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(nd - 1 - i) * s; + s *= 
shape.at(nd - 1 - i); + } + + return flat_index; +} + +py::ssize_t _ravel_multi_index_f(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(i) * s; + s *= shape.at(i); + } + + return flat_index; +} + +std::vector _unravel_index_c(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[nd - 1 - dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[nd - 1 - dim] = r; + i_ = q; + } + if (nd) { + mi[0] = i_; + } + return mi; +} + +std::vector _unravel_index_f(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[dim] = r; + i_ = q; + } + if (nd) { + mi[nd - 1] = i_; + } + return mi; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/simplify_iteration_space.hpp b/dpnp/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 000000000000..acbc833157d1 --- /dev/null +++ b/dpnp/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
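The _ravel_multi_index_c/_unravel_index_c pair above round-trips between flat and multi-indices with the last axis varying fastest (the _f variants use the first axis instead). A small self-checking example of the C-order logic:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    using shape_t = std::vector<std::ptrdiff_t>;

    // C order: the last axis varies fastest
    std::ptrdiff_t ravel_c(const shape_t &mi, const shape_t &shape)
    {
        std::ptrdiff_t flat = 0, s = 1;
        for (std::size_t i = shape.size(); i-- > 0;) {
            flat += mi[i] * s;
            s *= shape[i];
        }
        return flat;
    }

    shape_t unravel_c(std::ptrdiff_t flat, const shape_t &shape)
    {
        shape_t mi(shape.size());
        for (std::size_t i = shape.size(); i-- > 0;) {
            mi[i] = flat % shape[i];
            flat /= shape[i];
        }
        return mi;
    }

    int main()
    {
        const shape_t shape{2, 3, 4};
        assert(ravel_c({1, 2, 3}, shape) == 23); // 1*12 + 2*4 + 3
        assert(unravel_c(23, shape) == (shape_t{1, 2, 3}));
        return 0;
    }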
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector &, + std::vector &, + py::ssize_t &); + +void simplify_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector const &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_3(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_4(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // src3 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + +py::ssize_t _ravel_multi_index_c(std::vector const &, + std::vector const &); +py::ssize_t _ravel_multi_index_f(std::vector const &, + std::vector const &); +std::vector _unravel_index_c(py::ssize_t, + std::vector const &); +std::vector _unravel_index_f(py::ssize_t, + std::vector const &); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/isin.cpp b/dpnp/tensor/libtensor/source/sorting/isin.cpp new file mode 100644 index 000000000000..f1ae5863bbb9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/isin.cpp @@ -0,0 +1,325 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/isin.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ +namespace detail +{ + +using dpctl::tensor::kernels::isin_contig_impl_fp_ptr_t; + +static isin_contig_impl_fp_ptr_t + isin_contig_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinContigFactory +{ + constexpr IsinContigFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_contig_impl; + return isin_contig_impl; + } +}; + +using dpctl::tensor::kernels::isin_strided_impl_fp_ptr_t; + +static isin_strided_impl_fp_ptr_t + isin_strided_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinStridedFactory +{ + constexpr IsinStridedFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_strided_impl; + return isin_strided_impl; + } +}; + +void init_isin_dispatch_vector(void) +{ + + // Contiguous input function dispatch + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isin_contig_impl_dispatch_vector); + + // Strided input function dispatch + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isin_strided_impl_dispatch_vector); +} + +} // namespace detail + +/*! 
@brief search for needle from needles in sorted hay */ +std::pair + py_isin(const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const bool invert, + const std::vector &depends) +{ + const int hay_nd = hay.get_ndim(); + const int needles_nd = needles.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (hay_nd != 1 || needles_nd != dst_nd) { + throw py::value_error("Array dimensions mismatch"); + } + + // check that needle and dst have the same shape + std::size_t needles_nelems(1); + bool same_shape(true); + + const std::size_t hay_nelems = static_cast(hay.get_shape(0)); + + const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + for (int i = 0; (i < needles_nd) && same_shape; ++i) { + const auto needles_sh_i = needles_shape_ptr[i]; + const auto dst_sh_i = dst_shape_ptr[i]; + + same_shape = same_shape && (needles_sh_i == dst_sh_i); + needles_nelems *= static_cast(needles_sh_i); + } + + if (!same_shape) { + throw py::value_error( + "Array of values to search for and array of their " + "dst do not have the same shape"); + } + + // check that dst is ample enough + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + needles_nelems); + + // check that dst is writable + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // if output array overlaps with input arrays, race condition results + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(dst, hay) || overlap(dst, needles)) { + throw py::value_error("Destination array overlaps with input."); + } + + const int hay_typenum = hay.get_typenum(); + const int needles_typenum = needles.get_typenum(); + const int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum); + const int needles_typeid = + array_types.typenum_to_lookup_id(needles_typenum); + const int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // check hay and needle have the same data-type + if (needles_typeid != hay_typeid) { + throw py::value_error( + "Hay array and needles array must have the same data types"); + } + // check that dst has boolean data type + const auto dst_typenum_t_v = static_cast(dst_typeid); + if (dst_typenum_t_v != td_ns::typenum_t::BOOL) { + throw py::value_error("dst array must have data-type bool"); + } + + if (needles_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + // if all inputs are contiguous call contiguous implementations + // otherwise call strided implementation + const bool hay_is_c_contig = hay.is_c_contiguous(); + const bool hay_is_f_contig = hay.is_f_contiguous(); + + const bool needles_is_c_contig = needles.is_c_contiguous(); + const bool needles_is_f_contig = needles.is_f_contiguous(); + + const bool dst_is_c_contig = dst.is_c_contiguous(); + const bool dst_is_f_contig = dst.is_f_contiguous(); + + const bool all_c_contig = + (hay_is_c_contig && needles_is_c_contig && dst_is_c_contig); + const bool all_f_contig = + (hay_is_f_contig && needles_is_f_contig && dst_is_f_contig); + + const char *hay_data = hay.get_data(); + const char *needles_data = 
needles.get_data(); + + char *dst_data = dst.get_data(); + + if (all_c_contig || all_f_contig) { + auto fn = detail::isin_contig_impl_dispatch_vector[hay_typeid]; + + static constexpr py::ssize_t zero_offset(0); + + sycl::event comp_ev = fn(exec_q, invert, hay_nelems, needles_nelems, + hay_data, zero_offset, needles_data, + zero_offset, dst_data, zero_offset, depends); + + return std::make_pair(dpctl::utils::keep_args_alive( + exec_q, {hay, needles, dst}, {comp_ev}), + comp_ev); + } + + // strided case + + const auto &needles_strides = needles.get_strides_vector(); + const auto &dst_strides = dst.get_strides_vector(); + + int simplified_nd = needles_nd; + + using shT = std::vector; + shT simplified_common_shape; + shT simplified_needles_strides; + shT simplified_dst_strides; + py::ssize_t needles_offset(0); + py::ssize_t dst_offset(0); + + if (simplified_nd == 0) { + // needles and dst have same nd + simplified_nd = 1; + simplified_common_shape.push_back(1); + simplified_needles_strides.push_back(0); + simplified_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + // modified by reference + simplified_nd, + // read-only inputs + needles_shape_ptr, needles_strides, dst_strides, + // output, modified by reference + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides, needles_offset, dst_offset); + } + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // vectors being packed + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_strides_ev = + std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + auto strided_fn = detail::isin_strided_impl_dispatch_vector[hay_typeid]; + + if (!strided_fn) { + throw std::runtime_error( + "No implementation for data types of input arrays"); + } + + static constexpr py::ssize_t zero_offset(0); + py::ssize_t hay_step = hay.get_strides_vector()[0]; + + const sycl::event &comp_ev = strided_fn( + exec_q, invert, hay_nelems, needles_nelems, hay_data, zero_offset, + hay_step, needles_data, needles_offset, dst_data, dst_offset, + simplified_nd, packed_shape_strides, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + const sycl::event &ht_ev = dpctl::utils::keep_args_alive( + exec_q, {hay, needles, dst}, host_task_events); + + return std::make_pair(ht_ev, comp_ev); +} + +void init_isin_functions(py::module_ m) +{ + detail::init_isin_dispatch_vector(); + + m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("invert"), + py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/isin.hpp b/dpnp/tensor/libtensor/source/sorting/isin.hpp new file mode 100644 index 000000000000..236e8b5898c6 --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/isin.hpp @@ -0,0 +1,47 @@ 
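py_isin above requires hay to be 1-d and, per the kernel contract, sorted; each needle then reduces to a membership test against hay. A host-side model of the per-element predicate, assuming the device kernels perform the equivalent of a binary search over the sorted hay:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<std::uint8_t> isin(const std::vector<int> &needles,
                                   const std::vector<int> &sorted_hay,
                                   bool invert)
    {
        // one bool-like flag per needle, mirroring the bool-typed dst
        std::vector<std::uint8_t> dst(needles.size());
        for (std::size_t i = 0; i < needles.size(); ++i) {
            const bool found = std::binary_search(
                sorted_hay.begin(), sorted_hay.end(), needles[i]);
            dst[i] = static_cast<std::uint8_t>(invert ? !found : found);
        }
        return dst;
    }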
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_isin_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
new file mode 100644
index 000000000000..11df5cd2ef47
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/merge_argsort.cpp
@@ -0,0 +1,155 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
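The IsinContigFactory/IsinStridedFactory structs above, and the argsort factories in the file that follows, share one dispatch pattern: a factory template is instantiated once per supported dtype and fills a function-pointer table indexed by a runtime type id; returning nullptr marks a combination as unsupported (which py_isin guards with an explicit check). A stripped-down model, with all names illustrative rather than dpctl's:

    #include <array>
    #include <cstddef>

    using unary_fn_t = void (*)(const void *, void *, std::size_t);

    template <typename T>
    void negate_impl(const void *src_p, void *dst_p, std::size_t n)
    {
        const T *src = static_cast<const T *>(src_p);
        T *dst = static_cast<T *>(dst_p);
        for (std::size_t i = 0; i < n; ++i) {
            dst[i] = -src[i];
        }
    }

    // one factory instantiation per supported dtype; a factory may also
    // return nullptr to mark the dtype unsupported
    template <typename fnT, typename T>
    struct NegateFactory
    {
        fnT get() const { return negate_impl<T>; }
    };

    enum TypeId : std::size_t { FLOAT_ID = 0, DOUBLE_ID = 1, NUM_TYPE_IDS = 2 };

    static const std::array<unary_fn_t, NUM_TYPE_IDS> dispatch_vector{
        NegateFactory<unary_fn_t, float>{}.get(),
        NegateFactory<unary_fn_t, double>{}.get(),
    };
    // at call time: auto fn = dispatch_vector[type_id]; fn(src, dst, n);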
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_argsort.hpp" +#include "py_argsort_common.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_merge_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_merge_argsort_functions(py::module_ m) +{ + init_merge_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", 
py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp b/dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp new file mode 100644 index 000000000000..10777b4bc2fd --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/merge_argsort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
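_argsort_ascending and _argsort_descending, bound above, write permutation indices rather than sorted values. Per contiguous segment the operation amounts to a stable sort of an index vector ordered by the values it points at; a conceptual model of the semantics (not the SYCL merge-sort implementation):

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // stable argsort of one contiguous segment: indices 0..n-1 are
    // reordered so that values[idx[0]] <= values[idx[1]] <= ... and
    // ties keep their original order
    std::vector<std::int64_t> stable_argsort(const std::vector<double> &values)
    {
        std::vector<std::int64_t> idx(values.size());
        std::iota(idx.begin(), idx.end(), std::int64_t{0});
        std::stable_sort(idx.begin(), idx.end(),
                         [&values](std::int64_t i, std::int64_t j) {
                             return values[i] < values[j];
                         });
        return idx;
    }
    // descending order is the same routine with the comparison reversed,
    // which is exactly what the two dispatch tables above select between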
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_merge_argsort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/merge_sort.cpp b/dpnp/tensor/libtensor/source/sorting/merge_sort.cpp
new file mode 100644
index 000000000000..fbd60621b3bb
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/merge_sort.cpp
@@ -0,0 +1,139 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
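merge_sort.cpp, which follows, pairs the same stable merge-sort entry point with an ascending and a descending comparator type. Operationally, an (iter_nelems x sort_nelems) C-contiguous buffer is sorted row by row along the trailing axis; a minimal host-side model:

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <vector>

    // model of the axis-1 contiguous sort entry points: each contiguous
    // row of length sort_nelems is sorted independently with the chosen
    // comparator
    template <typename T, typename Comp>
    void sort_axis1_contig(std::vector<T> &data, std::size_t sort_nelems,
                           Comp comp)
    {
        if (sort_nelems == 0) {
            return;
        }
        for (std::size_t off = 0; off + sort_nelems <= data.size();
             off += sort_nelems)
        {
            auto first = data.begin() + static_cast<std::ptrdiff_t>(off);
            auto last = first + static_cast<std::ptrdiff_t>(sort_nelems);
            std::stable_sort(first, last, comp);
        }
    }

    // _sort_ascending corresponds to std::less and _sort_descending to
    // std::greater:
    //   sort_axis1_contig(buf, row_len, std::less<double>{});
    //   sort_axis1_contig(buf, row_len, std::greater<double>{});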
+//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_sort.hpp" +#include "py_sort_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template +struct AscendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template +struct DescendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_merge_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_merge_sort_functions(py::module_ m) +{ + init_merge_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/merge_sort.hpp b/dpnp/tensor/libtensor/source/sorting/merge_sort.hpp new file mode 100644 index 000000000000..a6bdd0a4efe9 --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/merge_sort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
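A note on the binding style used by init_merge_sort_functions and its siblings: each Python-visible function is a lambda adapter registered with named arguments and a depends list that defaults to an empty py::list. A self-contained toy module showing just that mechanism; the module and function names here are hypothetical:

    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h>

    #include <vector>

    namespace py = pybind11;

    int sum_impl(const std::vector<int> &values,
                 const std::vector<int> &depends)
    {
        // `depends` would carry sycl::event objects in the real bindings
        (void)depends;
        int s = 0;
        for (int v : values) {
            s += v;
        }
        return s;
    }

    PYBIND11_MODULE(_toy_impl, m)
    {
        // lambda adapter, as in the init_*_functions above
        auto py_sum = [](const std::vector<int> &values,
                         const std::vector<int> &depends) {
            return sum_impl(values, depends);
        };
        m.def("_sum", py_sum, py::arg("values"),
              py::arg("depends") = py::list());
    }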
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_merge_sort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp new file mode 100644 index 000000000000..018f3166a0ad --- /dev/null +++ b/dpnp/tensor/libtensor/source/sorting/py_argsort_common.hpp @@ -0,0 +1,183 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename sorting_contig_impl_fnT>
+std::pair<sycl::event, sycl::event>
+    py_argsort(const dpctl::tensor::usm_ndarray &src,
+               const int trailing_dims_to_sort,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends,
+               const sorting_contig_impl_fnT &sort_contig_fns)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iteration_nd = src_nd - trailing_dims_to_sort;
+    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
+                              "greater than rank of the array being sorted");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+
+    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t sort_nelems(1);
+    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        sort_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (sort_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, sort_nelems * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if ((dst_typeid != static_cast<int>(td_ns::typenum_t::INT64)) &&
+        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32)))
+    {
+        throw py::value_error(
+            "Output index array must have data type int32 or int64");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        if (sort_nelems > 1) {
+            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
+
+            auto fn = sort_contig_fns[src_typeid][dst_typeid];
+
+            if (fn == nullptr) {
+                throw py::value_error(
+                    "Not implemented for dtypes of input arrays");
+            }
+
+            sycl::event comp_ev =
+                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
+                   dst.get_data(), zero_offset, zero_offset, zero_offset,
+                   zero_offset, depends);
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
+
+            return std::make_pair(keep_args_alive_ev, comp_ev);
+        }
+        else {
+            assert(dst.get_size() == iter_nelems);
+            int dst_elemsize = dst.get_elemsize();
+            static constexpr int memset_val(0);
+
+            sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                cgh.memset(reinterpret_cast<void *>(dst.get_data()),
+                           memset_val, iter_nelems * dst_elemsize);
+            });
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {fill_ev});
+
+            return std::make_pair(keep_args_alive_ev, fill_ev);
+        }
+    }
+
+    throw py::value_error(
+        "Both source and destination arrays must be C-contiguous");
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp
new file mode 100644
index 000000000000..ee8777f35077
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/py_sort_common.hpp
@@ -0,0 +1,178 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename sorting_contig_impl_fnT>
+std::pair<sycl::event, sycl::event>
+    py_sort(const dpctl::tensor::usm_ndarray &src,
+            const int trailing_dims_to_sort,
+            const dpctl::tensor::usm_ndarray &dst,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends,
+            const sorting_contig_impl_fnT &sort_contig_fns)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    int iteration_nd = src_nd - trailing_dims_to_sort;
+    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
+                              "greater than rank of the array being sorted");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+
+    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t sort_nelems(1);
+    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        sort_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    if ((iter_nelems == 0) || (sort_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, sort_nelems * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error("Both input arrays must have "
+                              "the same value data type");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    if (is_src_c_contig && is_dst_c_contig) {
+        if (sort_nelems > 1) {
+            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
+
+            auto fn = sort_contig_fns[src_typeid];
+
+            if (nullptr == fn) {
+                throw py::value_error(
+                    "Not implemented for the dtype of input arrays");
+            }
+
+            sycl::event comp_ev =
+                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
+                   dst.get_data(), zero_offset, zero_offset, zero_offset,
+                   zero_offset, depends);
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
+
+            return std::make_pair(keep_args_alive_ev, comp_ev);
+        }
+        else {
+            assert(dst.get_size() == iter_nelems);
+            int src_elemsize = src.get_elemsize();
+
+            sycl::event copy_ev =
+                exec_q.copy<char>(src.get_data(), dst.get_data(),
+                                  src_elemsize * iter_nelems, depends);
+
+            return std::make_pair(
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                copy_ev);
+        }
+    }
+
+    throw py::value_error(
+        "Both source and destination arrays must be C-contiguous");
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
new file mode 100644
index 000000000000..0eec8fba9ded
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_argsort.cpp
@@ -0,0 +1,185 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "py_argsort_common.hpp"
+#include "radix_argsort.hpp"
+#include "radix_sort_support.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+static sort_contig_fn_ptr_t
+    ascending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                  [td_ns::num_types];
+
+namespace
+{
+
+template <bool is_ascending, typename argTy, typename IndexTy>
+sycl::event argsort_axis1_contig_caller(sycl::queue &q,
+                                        std::size_t iter_nelems,
+                                        std::size_t sort_nelems,
+                                        const char *arg_cp,
+                                        char *res_cp,
+                                        ssize_t iter_arg_offset,
+                                        ssize_t iter_res_offset,
+                                        ssize_t sort_arg_offset,
+                                        ssize_t sort_res_offset,
+                                        const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::kernels::radix_argsort_axis1_contig_impl;
+
+    return radix_argsort_axis1_contig_impl<argTy, IndexTy>(
+        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
+        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
+        depends);
+}
+
+} // end of anonymous namespace
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct AscendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
+                      (std::is_same_v<IndexTy, std::int64_t> ||
+                       std::is_same_v<IndexTy, std::int32_t>))
+        {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ true, argTy, IndexTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct DescendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
+                      (std::is_same_v<IndexTy, std::int64_t> ||
+                       std::is_same_v<IndexTy, std::int32_t>))
+        {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ false, argTy, IndexTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_radix_argsort_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
+                                AscendingRadixArgSortContigFactory,
+                                td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(ascending_radix_argsort_contig_dispatch_table);
+
+    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
+                                DescendingRadixArgSortContigFactory,
+                                td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(
+        descending_radix_argsort_contig_dispatch_table);
+}
+
+void init_radix_argsort_functions(py::module_ m)
+{
+    init_radix_argsort_dispatch_tables();
+
+    auto py_radix_argsort_ascending =
+        [](const dpctl::tensor::usm_ndarray &src,
+           const int trailing_dims_to_sort,
+           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
+           const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          ascending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_ascending", py_radix_argsort_ascending,
+          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_radix_argsort_descending =
+        [](const dpctl::tensor::usm_ndarray &src,
+           const int trailing_dims_to_sort,
+           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
+           const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                          descending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_descending", py_radix_argsort_descending,
+          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp b/dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp
new file mode 100644
index 000000000000..89013fbb1bdc
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_argsort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_radix_argsort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_sort.cpp b/dpnp/tensor/libtensor/source/sorting/radix_sort.cpp
new file mode 100644
index 000000000000..35c71a0eb7d3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_sort.cpp
@@ -0,0 +1,188 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <exception>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "py_sort_common.hpp"
+#include "radix_sort.hpp"
+#include "radix_sort_support.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
+
+using dpctl::tensor::ssize_t;
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+static sort_contig_fn_ptr_t
+    ascending_radix_sort_contig_dispatch_vector[td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_radix_sort_contig_dispatch_vector[td_ns::num_types];
+
+namespace
+{
+
+template <bool is_ascending, typename argTy>
+sycl::event sort_axis1_contig_caller(sycl::queue &q,
+                                     std::size_t iter_nelems,
+                                     std::size_t sort_nelems,
+                                     const char *arg_cp,
+                                     char *res_cp,
+                                     ssize_t iter_arg_offset,
+                                     ssize_t iter_res_offset,
+                                     ssize_t sort_arg_offset,
+                                     ssize_t sort_res_offset,
+                                     const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::kernels::radix_sort_axis1_contig_impl;
+
+    return radix_sort_axis1_contig_impl<argTy>(
+        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
+        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
+        depends);
+}
+
+} // end of anonymous namespace
+
+template <typename fnT, typename argTy>
+struct AscendingRadixSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
+            return sort_axis1_contig_caller</*ascending*/ true, argTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy>
+struct DescendingRadixSortContigFactory
+{
+    fnT get()
+    {
+        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
+            return sort_axis1_contig_caller</*ascending*/ false, argTy>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_radix_sort_dispatch_vectors(void)
+{
+    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    td_ns::DispatchVectorBuilder<
+        sort_contig_fn_ptr_t, AscendingRadixSortContigFactory, td_ns::num_types>
+        dtv1;
+    dtv1.populate_dispatch_vector(ascending_radix_sort_contig_dispatch_vector);
+
+    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
+                                 DescendingRadixSortContigFactory,
+                                 td_ns::num_types>
+        dtv2;
+    dtv2.populate_dispatch_vector(descending_radix_sort_contig_dispatch_vector);
+}
+
+bool py_radix_sort_defined(int typenum)
+{
+    const auto &array_types = td_ns::usm_ndarray_types();
+
+    try {
+        int type_id = array_types.typenum_to_lookup_id(typenum);
+        return (nullptr !=
+                ascending_radix_sort_contig_dispatch_vector[type_id]);
+    } catch (const std::exception &e) {
+        return false;
+    }
+}
+
+void init_radix_sort_functions(py::module_ m)
+{
+    init_radix_sort_dispatch_vectors();
+
+    auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src,
+                                      const int trailing_dims_to_sort,
+                                      const dpctl::tensor::usm_ndarray &dst,
+                                      sycl::queue &exec_q,
+                                      const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       ascending_radix_sort_contig_dispatch_vector);
+    };
+    m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto py_radix_sort_descending = [](const dpctl::tensor::usm_ndarray &src,
+                                       const int trailing_dims_to_sort,
+                                       const dpctl::tensor::usm_ndarray &dst,
+                                       sycl::queue &exec_q,
+                                       const std::vector<sycl::event> &depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends,
+                       descending_radix_sort_contig_dispatch_vector);
+    };
+    m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_radix_sort_dtype_supported", py_radix_sort_defined);
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_sort.hpp b/dpnp/tensor/libtensor/source/sorting/radix_sort.hpp
new file mode 100644
index 000000000000..5f3c771b464b
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_sort.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_radix_sort_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp b/dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp
new file mode 100644
index 000000000000..8d7e55a5cd28
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/radix_sort_support.hpp
@@ -0,0 +1,78 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
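The support header that follows resolves, at compile time, whether the radix-sort kernels are defined for a given element type by walking a std::disjunction of per-type entries terminated by an always-true fallback. A small self-contained sketch of the idiom (type names illustrative, list shortened to two supported types):

    #include <cstdint>
    #include <type_traits>

    template <typename T, typename ArgTy>
    struct DefinedEntry : std::bool_constant<std::is_same_v<T, ArgTy>>
    {
        static constexpr bool is_defined = true;
    };

    struct FallbackEntry : std::true_type
    {
        static constexpr bool is_defined = false;
    };

    // std::disjunction aliases the first entry whose ::value is true; the
    // always-true fallback terminates the search for unsupported types.
    template <typename T> struct SupportVector
    {
        using resolver_t = std::disjunction<DefinedEntry<T, std::int32_t>,
                                            DefinedEntry<T, float>,
                                            FallbackEntry>;
        static constexpr bool is_defined = resolver_t::is_defined;
    };

    static_assert(SupportVector<float>::is_defined, "float is supported");
    static_assert(!SupportVector<char>::is_defined, "char falls through");

The trick is that the matched entry carries an extra is_defined flag alongside the ::value that std::disjunction inspects, so one trait answers "is this type in the supported list?" without a hand-written specialization per type.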
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+
+namespace dpctl::tensor::py_internal
+{
+
+template <typename T, typename ArgTy>
+struct TypeDefinedEntry : std::bool_constant<std::is_same_v<T, ArgTy>>
+{
+    static constexpr bool is_defined = true;
+};
+
+struct NotDefinedEntry : std::true_type
+{
+    static constexpr bool is_defined = false;
+};
+
+template <typename argTy>
+struct RadixSortSupportVector
+{
+    using resolver_t =
+        typename std::disjunction<TypeDefinedEntry<argTy, bool>,
+                                  TypeDefinedEntry<argTy, std::int8_t>,
+                                  TypeDefinedEntry<argTy, std::uint8_t>,
+                                  TypeDefinedEntry<argTy, std::int16_t>,
+                                  TypeDefinedEntry<argTy, std::uint16_t>,
+                                  TypeDefinedEntry<argTy, std::int32_t>,
+                                  TypeDefinedEntry<argTy, std::uint32_t>,
+                                  TypeDefinedEntry<argTy, std::int64_t>,
+                                  TypeDefinedEntry<argTy, std::uint64_t>,
+                                  TypeDefinedEntry<argTy, sycl::half>,
+                                  TypeDefinedEntry<argTy, float>,
+                                  TypeDefinedEntry<argTy, double>,
+                                  NotDefinedEntry>;
+
+    static constexpr bool is_defined = resolver_t::is_defined;
+};
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
new file mode 100644
index 000000000000..6c50b0cbc08c
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/searchsorted.cpp
@@ -0,0 +1,473 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
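py_searchsorted below validates shapes, queues, and dtypes, then dispatches to a contiguous or strided kernel; the left/right distinction is the familiar lower/upper bound split. A host-side sketch of the invariant the device kernels implement, using the standard library (illustrative only; the kernels do not call these):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
        const std::vector<int> hay = {1, 2, 2, 3};

        // side='left': first position where needle could be inserted
        // while keeping hay sorted, before any equal elements.
        auto left = std::lower_bound(hay.begin(), hay.end(), 2) - hay.begin();

        // side='right': insertion position after any equal elements.
        auto right = std::upper_bound(hay.begin(), hay.end(), 2) - hay.begin();

        assert(left == 1 && right == 3);
        return 0;
    }

Because hay is required to be one-dimensional while needles/positions may be strided n-d arrays, only the needles/positions iteration space is simplified and packed for the strided path; hay contributes a single stride.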
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/sorting/searchsorted.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/rich_comparisons.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "simplify_iteration_space.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace detail
+{
+
+using dpctl::tensor::kernels::searchsorted_contig_impl_fp_ptr_t;
+
+static searchsorted_contig_impl_fp_ptr_t
+    left_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
+
+static searchsorted_contig_impl_fp_ptr_t
+    right_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct LeftSideSearchSortedContigFactory
+{
+    constexpr LeftSideSearchSortedContigFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool left_side_search(true);
+            using dpctl::tensor::kernels::searchsorted_contig_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_contig_impl<argTy, IndexTy, Compare,
+                                            left_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct RightSideSearchSortedContigFactory
+{
+    constexpr RightSideSearchSortedContigFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool right_side_search(false);
+
+            using dpctl::tensor::kernels::searchsorted_contig_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_contig_impl<argTy, IndexTy, Compare,
+                                            right_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+using dpctl::tensor::kernels::searchsorted_strided_impl_fp_ptr_t;
+
+static searchsorted_strided_impl_fp_ptr_t
+    left_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
+
+static searchsorted_strided_impl_fp_ptr_t
+    right_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct LeftSideSearchSortedStridedFactory
+{
+    constexpr LeftSideSearchSortedStridedFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool left_side_search(true);
+            using dpctl::tensor::kernels::searchsorted_strided_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_strided_impl<argTy, IndexTy, Compare,
+                                             left_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+template <typename fnT, typename argTy, typename IndexTy>
+struct RightSideSearchSortedStridedFactory
+{
+    constexpr RightSideSearchSortedStridedFactory() {}
+
+    fnT get() const
+    {
+        if constexpr (std::is_same_v<IndexTy, std::int32_t> ||
+                      std::is_same_v<IndexTy, std::int64_t>)
+        {
+            static constexpr bool right_side_search(false);
+            using dpctl::tensor::kernels::searchsorted_strided_impl;
+            using dpctl::tensor::rich_comparisons::AscendingSorter;
+
+            using Compare = typename AscendingSorter<argTy>::type;
+
+            return searchsorted_strided_impl<argTy, IndexTy, Compare,
+                                             right_side_search>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_searchsorted_dispatch_table(void)
+{
+
+    // Contiguous input function dispatch
+    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
+                                LeftSideSearchSortedContigFactory,
+                                td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(left_side_searchsorted_contig_impl);
+
+    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
+                                RightSideSearchSortedContigFactory,
+                                td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(right_side_searchsorted_contig_impl);
+
+    // Strided input function dispatch
+    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
+                                LeftSideSearchSortedStridedFactory,
+                                td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(left_side_searchsorted_strided_impl);
+
+    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
+                                RightSideSearchSortedStridedFactory,
+                                td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(right_side_searchsorted_strided_impl);
+}
+
+} // namespace detail
+
+/*! @brief search for needle from needles in sorted hay */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
+                    const dpctl::tensor::usm_ndarray &needles,
+                    const dpctl::tensor::usm_ndarray &positions,
+                    sycl::queue &exec_q,
+                    const bool search_left_side,
+                    const std::vector<sycl::event> &depends)
+{
+    const int hay_nd = hay.get_ndim();
+    const int needles_nd = needles.get_ndim();
+    const int positions_nd = positions.get_ndim();
+
+    if (hay_nd != 1 || needles_nd != positions_nd) {
+        throw py::value_error("Array dimensions mismatch");
+    }
+
+    // check that needles and positions have the same shape
+    std::size_t needles_nelems(1);
+    bool same_shape(true);
+
+    const std::size_t hay_nelems = static_cast<std::size_t>(hay.get_shape(0));
+
+    const py::ssize_t *needles_shape_ptr = needles.get_shape_raw();
+    const py::ssize_t *positions_shape_ptr = positions.get_shape_raw();
+
+    for (int i = 0; (i < needles_nd) && same_shape; ++i) {
+        const auto needles_sh_i = needles_shape_ptr[i];
+        const auto positions_sh_i = positions_shape_ptr[i];
+
+        same_shape = same_shape && (needles_sh_i == positions_sh_i);
+        needles_nelems *= static_cast<std::size_t>(needles_sh_i);
+    }
+
+    if (!same_shape) {
+        throw py::value_error(
+            "Array of values to search for and array of their "
+            "positions do not have the same shape");
+    }
+
+    // check that positions is ample enough
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(positions,
+                                                               needles_nelems);
+
+    // check that positions is writable
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions);
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q,
+                                             {hay, needles, positions}))
+    {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    // if output array overlaps with input arrays, race condition results
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(positions, hay) || overlap(positions, needles)) {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    const int hay_typenum = hay.get_typenum();
+    const int needles_typenum = needles.get_typenum();
+    const int positions_typenum = positions.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum);
+    const int needles_typeid =
+        array_types.typenum_to_lookup_id(needles_typenum);
+    const int positions_typeid =
+        array_types.typenum_to_lookup_id(positions_typenum);
+
+    // check hay and needles have the same data-type
+    if (needles_typeid != hay_typeid) {
+        throw py::value_error(
+            "Hay array and needles array must have the same data types");
+    }
+    // check that positions has indexing data-type (int32, or int64)
+    const auto positions_typenum_t_v =
+        static_cast<td_ns::typenum_t>(positions_typeid);
+    if (positions_typenum_t_v != td_ns::typenum_t::INT32 &&
+        positions_typenum_t_v != td_ns::typenum_t::INT64)
+    {
+        throw py::value_error(
+            "Positions array must have data-type int32, or int64");
+    }
+
+    if (needles_nelems == 0) {
+        // Nothing to do
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    // if all inputs are contiguous call contiguous implementations
+    // otherwise call strided implementation
+    const bool hay_is_c_contig = hay.is_c_contiguous();
+    const bool hay_is_f_contig = hay.is_f_contiguous();
+
+    const bool needles_is_c_contig = needles.is_c_contiguous();
+    const bool needles_is_f_contig = needles.is_f_contiguous();
+
+    const bool positions_is_c_contig = positions.is_c_contiguous();
+    const bool positions_is_f_contig = positions.is_f_contiguous();
+
+    const bool all_c_contig =
+        (hay_is_c_contig && needles_is_c_contig && positions_is_c_contig);
+    const bool all_f_contig =
+        (hay_is_f_contig && needles_is_f_contig && positions_is_f_contig);
+
+    const char *hay_data = hay.get_data();
+    const char *needles_data = needles.get_data();
+
+    char *positions_data = positions.get_data();
+
+    if (all_c_contig || all_f_contig) {
+        auto fn =
+            (search_left_side)
+                ? detail::left_side_searchsorted_contig_impl[hay_typeid]
+                                                            [positions_typeid]
+                : detail::right_side_searchsorted_contig_impl[hay_typeid]
+                                                             [positions_typeid];
+
+        if (fn) {
+            static constexpr py::ssize_t zero_offset(0);
+
+            sycl::event comp_ev =
+                fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset,
+                   needles_data, zero_offset, positions_data, zero_offset,
+                   depends);
+
+            return std::make_pair(
+                dpctl::utils::keep_args_alive(exec_q, {hay, needles, positions},
+                                              {comp_ev}),
+                comp_ev);
+        }
+    }
+
+    // strided case
+
+    const auto &needles_strides = needles.get_strides_vector();
+    const auto &positions_strides = positions.get_strides_vector();
+
+    int simplified_nd = needles_nd;
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_common_shape;
+    shT simplified_needles_strides;
+    shT simplified_positions_strides;
+    py::ssize_t needles_offset(0);
+    py::ssize_t positions_offset(0);
+
+    if (simplified_nd == 0) {
+        // needles and positions have same nd
+        simplified_nd = 1;
+        simplified_common_shape.push_back(1);
+        simplified_needles_strides.push_back(0);
+        simplified_positions_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(
+            // modified by reference
+            simplified_nd,
+            // read-only inputs
+            needles_shape_ptr, needles_strides, positions_strides,
+            // output, modified by reference
+            simplified_common_shape, simplified_needles_strides,
+            simplified_positions_strides, needles_offset, positions_offset);
+    }
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // vectors being packed
+        simplified_common_shape, simplified_needles_strides,
+        simplified_positions_strides);
+    auto packed_shape_strides_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    const sycl::event &copy_shape_strides_ev =
+        std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    auto strided_fn =
+        (search_left_side)
+            ? detail::left_side_searchsorted_strided_impl[hay_typeid]
+                                                         [positions_typeid]
+            : detail::right_side_searchsorted_strided_impl[hay_typeid]
+                                                          [positions_typeid];
+
+    if (!strided_fn) {
+        throw std::runtime_error(
+            "No implementation for data types of input arrays");
+    }
+
+    static constexpr py::ssize_t zero_offset(0);
+    py::ssize_t hay_step = hay.get_strides_vector()[0];
+
+    const sycl::event &comp_ev = strided_fn(
+        exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, hay_step,
+        needles_data, needles_offset, positions_data, positions_offset,
+        simplified_nd, packed_shape_strides, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev =
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {comp_ev}, packed_shape_strides_owner);
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+    const sycl::event &ht_ev = dpctl::utils::keep_args_alive(
+        exec_q, {hay, needles, positions}, host_task_events);
+
+    return std::make_pair(ht_ev, comp_ev);
+}
+
+/*! @brief search for needle from needles in sorted hay,
+ *         hay[pos] <= needle < hay[pos + 1]
+ */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay,
+                         const dpctl::tensor::usm_ndarray &needles,
+                         const dpctl::tensor::usm_ndarray &positions,
+                         sycl::queue &exec_q,
+                         const std::vector<sycl::event> &depends)
+{
+    static constexpr bool side_left(true);
+    return py_searchsorted(hay, needles, positions, exec_q, side_left, depends);
+}
+
+/*! @brief search for needle from needles in sorted hay,
+ *         hay[pos] < needle <= hay[pos + 1]
+ */
+std::pair<sycl::event, sycl::event>
+    py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay,
+                          const dpctl::tensor::usm_ndarray &needles,
+                          const dpctl::tensor::usm_ndarray &positions,
+                          sycl::queue &exec_q,
+                          const std::vector<sycl::event> &depends)
+{
+    static constexpr bool side_right(false);
+    return py_searchsorted(hay, needles, positions, exec_q, side_right,
+                           depends);
+}
+
+void init_searchsorted_functions(py::module_ m)
+{
+    detail::init_searchsorted_dispatch_table();
+
+    m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"),
+          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+    m.def("_searchsorted_right", &py_searchsorted_right, py::arg("hay"),
+          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/searchsorted.hpp b/dpnp/tensor/libtensor/source/sorting/searchsorted.hpp
new file mode 100644
index 000000000000..b60dae1e0ec9
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/searchsorted.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_searchsorted_functions(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/sorting/topk.cpp b/dpnp/tensor/libtensor/source/sorting/topk.cpp
new file mode 100644
index 000000000000..6b8344df12c8
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/topk.cpp
@@ -0,0 +1,303 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
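py_topk below picks a radix-based kernel for small integer types (see use_radix_sort) and a comparator-based merge path for everything else; largest-first selection is just the descending comparator, or ascending order for the radix path. The host-side analogue of the largest-k selection is a partial sort; a minimal sketch under that analogy (not the device implementation, which also emits int64 indices):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <vector>

    int main()
    {
        std::vector<float> v = {0.5f, 3.0f, -1.0f, 7.0f, 2.0f};
        const std::size_t k = 2;

        // largest=true: place the k largest values first, in descending
        // order; only the first k positions are guaranteed to be ordered.
        std::partial_sort(v.begin(), v.begin() + k, v.end(), std::greater<>{});

        for (std::size_t i = 0; i < k; ++i)
            std::printf("%g\n", v[i]); // prints 7 then 3
        return 0;
    }

Selecting k of n elements this way costs O(n log k) comparisons, which is why a dedicated top-k kernel beats sorting the whole axis when k is small relative to axis_nelems.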
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "kernels/sorting/topk.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/rich_comparisons.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "topk.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &,
+                                          std::size_t,
+                                          std::size_t,
+                                          std::size_t,
+                                          bool,
+                                          const char *,
+                                          char *,
+                                          char *,
+                                          const std::vector<sycl::event> &);
+
+static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types];
+
+namespace
+{
+
+template <typename T, typename = void>
+struct use_radix_sort : public std::false_type
+{
+};
+
+template <typename T>
+struct use_radix_sort<
+    T,
+    std::enable_if_t<std::disjunction<std::is_same<T, bool>,
+                                      std::is_same<T, std::uint8_t>,
+                                      std::is_same<T, std::int8_t>,
+                                      std::is_same<T, std::uint16_t>,
+                                      std::is_same<T, std::int16_t>>::value>>
+    : public std::true_type
+{
+};
+
+template <typename argTy, typename IndexTy>
+sycl::event topk_caller(sycl::queue &exec_q,
+                        std::size_t iter_nelems, // number of sub-arrays
+                        std::size_t axis_nelems, // size of each sub-array
+                        std::size_t k,
+                        bool largest,
+                        const char *arg_cp,
+                        char *vals_cp,
+                        char *inds_cp,
+                        const std::vector<sycl::event> &depends)
+{
+    if constexpr (use_radix_sort<argTy>::value) {
+        using dpctl::tensor::kernels::topk_radix_impl;
+        auto ascending = !largest;
+        return topk_radix_impl<argTy, IndexTy>(exec_q, iter_nelems, axis_nelems,
+                                               k, ascending, arg_cp, vals_cp,
+                                               inds_cp, depends);
+    }
+    else {
+        using dpctl::tensor::kernels::topk_merge_impl;
+        if (largest) {
+            using CompTy =
+                typename dpctl::tensor::rich_comparisons::DescendingSorter<
+                    argTy>::type;
+            return topk_merge_impl<argTy, IndexTy, CompTy>(
+                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
+                depends);
+        }
+        else {
+            using CompTy =
+                typename dpctl::tensor::rich_comparisons::AscendingSorter<
+                    argTy>::type;
+            return topk_merge_impl<argTy, IndexTy, CompTy>(
+                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
+                depends);
+        }
+    }
+}
+
+} // namespace
+
+std::pair<sycl::event, sycl::event>
+    py_topk(const dpctl::tensor::usm_ndarray &src,
+            std::optional<const int> trailing_dims_to_search,
+            const std::size_t k,
+            const bool largest,
+            const dpctl::tensor::usm_ndarray &vals,
+            const dpctl::tensor::usm_ndarray &inds,
+            sycl::queue &exec_q,
+            const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int vals_nd = vals.get_ndim();
+    int inds_nd = inds.get_ndim();
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *vals_shape_ptr = vals.get_shape_raw();
+    const py::ssize_t *inds_shape_ptr = inds.get_shape_raw();
+
+    std::size_t axis_nelems(1);
+    std::size_t iter_nelems(1);
+    if (trailing_dims_to_search.has_value()) {
+        if (src_nd != vals_nd || src_nd != inds_nd) {
+            throw py::value_error("The input and output arrays must have "
+                                  "the same array ranks");
+        }
+
+        auto trailing_dims = trailing_dims_to_search.value();
+        int iter_nd = src_nd - trailing_dims;
+        if (trailing_dims <= 0 || iter_nd < 0) {
+            throw py::value_error(
+                "trailing_dims_to_search must be positive, but no "
+                "greater than rank of the array being searched");
+        }
+
+        bool same_shapes = true;
+        for (int i = 0; same_shapes && (i < iter_nd); ++i) {
+            auto src_shape_i = src_shape_ptr[i];
+            same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] &&
+                                          src_shape_i == inds_shape_ptr[i]);
+            iter_nelems *= static_cast<std::size_t>(src_shape_i);
+        }
+
+        if (!same_shapes) {
+            throw py::value_error(
+                "Destination shape does not match the input shape");
+        }
+
+        std::size_t vals_k(1);
+        std::size_t inds_k(1);
+        for (int i = iter_nd; i < src_nd; ++i) {
+            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+            vals_k *= static_cast<std::size_t>(vals_shape_ptr[i]);
+            inds_k *= static_cast<std::size_t>(inds_shape_ptr[i]);
+        }
+
+        bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k);
+        if (!valid_k) {
+            throw py::value_error("The value of k is invalid for the input and "
+                                  "destination arrays");
+        }
+    }
+    else {
+        if (vals_nd != 1 || inds_nd != 1) {
+            throw py::value_error("Output arrays must be one-dimensional");
+        }
+
+        for (int i = 0; i < src_nd; ++i) {
+            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
+        }
+
+        bool valid_k = (axis_nelems >= k &&
+                        static_cast<std::size_t>(vals_shape_ptr[0]) == k &&
+                        static_cast<std::size_t>(inds_shape_ptr[0]) == k);
+        if (!valid_k) {
+            throw py::value_error("The value of k is invalid for the input and "
+                                  "destination arrays");
+        }
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals);
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds);
+
+    if ((iter_nelems == 0) || (axis_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, vals) || overlap(src, inds)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals,
+                                                               k * iter_nelems);
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds,
+                                                               k * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int vals_typenum = vals.get_typenum();
+    int inds_typenum = inds.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum);
+    int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum);
+
+    if (src_typeid != vals_typeid) {
+        throw py::value_error("Input array and vals array must have "
+                              "the same data type");
+    }
+
+    if (inds_typeid != static_cast<int>(td_ns::typenum_t::INT64)) {
+        throw py::value_error("Inds array must have data type int64");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_vals_c_contig = vals.is_c_contiguous();
+    bool is_inds_c_contig = inds.is_c_contiguous();
+
+    if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) {
+        auto fn = topk_dispatch_vector[src_typeid];
+
+        sycl::event comp_ev =
+            fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(),
+               vals.get_data(), inds.get_data(), depends);
+
+        sycl::event keep_args_alive_ev =
+            dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev});
+
+        return std::make_pair(keep_args_alive_ev, comp_ev);
+    }
+
+    return std::make_pair(sycl::event(), sycl::event());
+}
+
+template <typename fnT, typename argTy>
+struct TopKFactory
+{
+    fnT get()
+    {
+        using IdxT = std::int64_t;
+        return topk_caller<argTy, IdxT>;
+    }
+};
+
+void init_topk_dispatch_vectors(void)
+{
+    td_ns::DispatchVectorBuilder<topk_impl_fn_ptr_t, TopKFactory,
+                                 td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(topk_dispatch_vector);
+}
+
+void init_topk_functions(py::module_ m)
+{
+    init_topk_dispatch_vectors();
+
+    m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"),
+          py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal
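Editor's note between files: every `*_impl` extension in this patch resolves its kernels through the same type-dispatch machinery; a templated factory is instantiated once per supported dtype and the resulting function pointers are stored in a table indexed by the runtime type id. The standalone sketch below is an illustration only: `Factory`, `impl`, `dispatch_vector` and `populate` are invented names, and the real `DispatchVectorBuilder` covers all dpnp dtypes rather than two.

// Standalone illustration of the dispatch-vector idea used above.
#include <cstddef>
#include <cstdint>
#include <iostream>

using impl_fn_ptr_t = void (*)(const void *, void *, std::size_t);

// one type-specialized "kernel" per supported dtype
template <typename T>
void impl(const void *src, void *dst, std::size_t n)
{
    const T *s = static_cast<const T *>(src);
    T *d = static_cast<T *>(dst);
    for (std::size_t i = 0; i < n; ++i)
        d[i] = s[i];
}

// one slot per supported type id, filled once at module load
static impl_fn_ptr_t dispatch_vector[2];

template <typename fnT, typename T>
struct Factory
{
    fnT get() { return impl<T>; }
};

void populate()
{
    dispatch_vector[0] = Factory<impl_fn_ptr_t, std::int32_t>{}.get();
    dispatch_vector[1] = Factory<impl_fn_ptr_t, float>{}.get();
}

int main()
{
    populate();
    std::int32_t a[3] = {1, 2, 3}, b[3] = {};
    dispatch_vector[0](a, b, 3); // runtime type id selects the compiled kernel
    std::cout << b[2] << "\n";   // prints 3
}

The benefit of the pattern is that the per-dtype instantiation cost is paid once at import time, while each Python-level call performs a single array lookup, as in `topk_dispatch_vector[src_typeid]` above.
diff --git a/dpnp/tensor/libtensor/source/sorting/topk.hpp b/dpnp/tensor/libtensor/source/sorting/topk.hpp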
new file mode 100644
index 000000000000..d39c0eefdb93
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/sorting/topk.hpp
@@ -0,0 +1,47 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_topk_functions(py::module_);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/tensor_accumulation.cpp b/dpnp/tensor/libtensor/source/tensor_accumulation.cpp
new file mode 100644
index 000000000000..faa3fc8b52c6
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_accumulation.cpp
@@ -0,0 +1,43 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_accumulation_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "accumulators/accumulators_common.hpp"
+
+PYBIND11_MODULE(_tensor_accumulation_impl, m)
+{
+    dpctl::tensor::py_internal::init_accumulator_functions(m);
+}
diff --git a/dpnp/tensor/libtensor/source/tensor_ctors.cpp b/dpnp/tensor/libtensor/source/tensor_ctors.cpp
new file mode 100644
index 000000000000..cdd6e43ed9c5
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_ctors.cpp
@@ -0,0 +1,497 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===----------------------------------------------------------------------===//
+
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+#include "accumulators.hpp"
+#include "boolean_advanced_indexing.hpp"
+#include "clip.hpp"
+#include "copy_and_cast_usm_to_usm.hpp"
+#include "copy_as_contig.hpp"
+#include "copy_for_reshape.hpp"
+#include "copy_for_roll.hpp"
+#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
+#include "device_support_queries.hpp"
+#include "eye_ctor.hpp"
+#include "full_ctor.hpp"
+#include "integer_advanced_indexing.hpp"
+#include "kernels/dpctl_tensor_types.hpp"
+#include "linear_sequences.hpp"
+#include "repeat.hpp"
+#include "simplify_iteration_space.hpp"
+#include "triul_ctor.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/strided_iters.hpp"
+#include "where.hpp"
+#include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+
+static_assert(std::is_same_v<py::ssize_t, dpctl::tensor::ssize_t>);
+
+namespace
+{
+
+using dpctl::tensor::overlap::MemoryOverlap;
+using dpctl::tensor::overlap::SameLogicalTensors;
+
+using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
+using dpctl::tensor::py_internal::py_as_c_contig;
+using dpctl::tensor::py_internal::py_as_f_contig;
+
+/* =========================== Copy for reshape ============================= */
+
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape;
+
+/* =========================== Copy for roll ============================= */
+
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d;
+using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd;
+
+/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */
+
+using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray;
+
+/* ============= linear-sequence ==================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine;
+using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;
+
+/* ================ Full ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_full;
+
+/* ================ Zeros ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_zeros;
+
+/* ============== Advanced Indexing ============= */
+using dpctl::tensor::py_internal::usm_ndarray_put;
+using dpctl::tensor::py_internal::usm_ndarray_take;
+
+using dpctl::tensor::py_internal::py_extract;
+using dpctl::tensor::py_internal::py_mask_positions;
+using dpctl::tensor::py_internal::py_nonzero;
+using dpctl::tensor::py_internal::py_place;
+
+/* ================= Repeat ====================*/
+using dpctl::tensor::py_internal::py_cumsum_1d;
+using dpctl::tensor::py_internal::py_repeat_by_scalar;
+using dpctl::tensor::py_internal::py_repeat_by_sequence;
+
+/* ================ Eye ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_eye;
+
+/* =========================== Tril and triu ============================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_triul;
+
+/* =========================== Where ============================== */
+
+using dpctl::tensor::py_internal::py_where;
+
+/* =========================== Clip ============================== */
+using dpctl::tensor::py_internal::py_clip;
+
+// populate dispatch tables
+void init_dispatch_tables(void)
+{
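+    // Editorial note (annotation, not upstream code): dispatch *tables* are
+    // two-dimensional lookups keyed by a pair of type ids (e.g. source and
+    // destination dtype for copy-and-cast), while the dispatch *vectors*
+    // initialized below are one-dimensional lookups keyed by a single dtype.
+    // Schematically, with hypothetical names:
+    //
+    //   fn_ptr_t table[td_ns::num_types][td_ns::num_types];
+    //   table[src_typeid][dst_typeid] -> type-specialized kernel
+    //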
+    using namespace dpctl::tensor::py_internal;
+
+    init_copy_and_cast_usm_to_usm_dispatch_tables();
+    init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
+    init_advanced_indexing_dispatch_tables();
+    init_where_dispatch_tables();
+    return;
+}
+
+// populate dispatch vectors
+void init_dispatch_vectors(void)
+{
+    using namespace dpctl::tensor::py_internal;
+
+    init_copy_as_contig_dispatch_vectors();
+    init_copy_for_reshape_dispatch_vectors();
+    init_copy_for_roll_dispatch_vectors();
+    init_linear_sequences_dispatch_vectors();
+    init_full_ctor_dispatch_vectors();
+    init_zeros_ctor_dispatch_vectors();
+    init_eye_ctor_dispatch_vectors();
+    init_triul_ctor_dispatch_vectors();
+
+    populate_masked_extract_dispatch_vectors();
+    populate_masked_place_dispatch_vectors();
+
+    populate_mask_positions_dispatch_vectors();
+
+    populate_cumsum_1d_dispatch_vectors();
+    init_repeat_dispatch_vectors();
+
+    init_clip_dispatch_vectors();
+
+    return;
+}
+
+} // namespace
+
+PYBIND11_MODULE(_tensor_impl, m)
+{
+    init_dispatch_tables();
+    init_dispatch_vectors();
+
+    using dpctl::tensor::strides::contract_iter;
+    m.def(
+        "_contract_iter", &contract_iter,
+        "Simplifies iteration of array of given shape & stride. Returns "
+        "a triple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension, which traverses the same elements as the original "
+        "iterator, possibly in a different order.");
+
+    m.def("_copy_usm_ndarray_into_usm_ndarray",
+          &copy_usm_ndarray_into_usm_ndarray,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same "
+          "shape. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_as_c_contig", &py_as_c_contig,
+          "Copies from usm_ndarray `src` into C-contiguous usm_ndarray "
+          "`dst` of the same shape and the same data type. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_as_f_contig", &py_as_f_contig,
+          "Copies from usm_ndarray `src` into F-contiguous usm_ndarray "
+          "`dst` of the same shape and the same data type. "
+          "Returns a tuple of events: (host_task_event, compute_task_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    using dpctl::tensor::strides::contract_iter2;
+    m.def(
+        "_contract_iter2", &contract_iter2,
+        "Simplifies iteration over elements of pair of arrays of given shape "
+        "with strides stride1 and stride2. Returns "
+        "a 5-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    using dpctl::tensor::strides::contract_iter3;
+    m.def(
+        "_contract_iter3", &contract_iter3,
+        "Simplifies iteration over elements of 3-tuple of arrays of given "
+        "shape "
+        "with strides stride1, stride2, and stride3. Returns "
+        "a 7-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    using dpctl::tensor::strides::contract_iter4;
+    m.def(
+        "_contract_iter4", &contract_iter4,
+        "Simplifies iteration over elements of 4-tuple of arrays of given "
+        "shape "
+        "with strides stride1, stride2, stride3, and stride4. Returns "
+        "a 9-tuple: shape, stride and offset for the new iterator of possible "
+        "smaller dimension for each array, which traverses the same elements "
+        "as the original "
+        "iterator, possibly in a different order.");
+
+    static constexpr char orderC = 'C';
+    m.def(
+        "_ravel_multi_index",
+        [](const std::vector<py::ssize_t> &mi,
+           const std::vector<py::ssize_t> &shape, char order = 'C') {
+            if (order == orderC) {
+                return dpctl::tensor::py_internal::_ravel_multi_index_c(mi,
+                                                                        shape);
+            }
+            else {
+                return dpctl::tensor::py_internal::_ravel_multi_index_f(mi,
+                                                                        shape);
+            }
+        },
+        "");
+
+    m.def(
+        "_unravel_index",
+        [](py::ssize_t flat_index, const std::vector<py::ssize_t> &shape,
+           char order = 'C') {
+            if (order == orderC) {
+                return dpctl::tensor::py_internal::_unravel_index_c(flat_index,
+                                                                    shape);
+            }
+            else {
+                return dpctl::tensor::py_internal::_unravel_index_f(flat_index,
+                                                                    shape);
+            }
+        },
+        "");
+
+    m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "number of elements using underlying 'C'-contiguous order for flat "
+          "traversal. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "shapes using underlying 'C'-contiguous order for flat "
+          "traversal with shift. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("shift"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd,
+          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
+          "shapes using underlying 'C'-contiguous order for "
+          "traversal with shifts along each axis. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("src"), py::arg("dst"), py::arg("shifts"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
+          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+          "specified by "
+          "starting point `start` and step `dt`. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("start"), py::arg("dt"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
+          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
+          "specified by "
+          "starting point `start` and end point `end`. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("start"), py::arg("end"), py::arg("dst"),
+          py::arg("include_endpoint"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_copy_numpy_ndarray_into_usm_ndarray",
+          &copy_numpy_ndarray_into_usm_ndarray,
+          "Copy from numpy array `src` into usm_ndarray `dst` synchronously.",
+          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
+          "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_full_usm_ndarray", &usm_ndarray_full,
+          "Populate usm_ndarray `dst` with given fill_value.",
+          py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_take", &usm_ndarray_take,
+          "Takes elements at usm_ndarray indices `ind` and axes starting "
+          "at axis `axis_start` from array `src` and copies them "
+          "into usm_ndarray `dst` synchronously. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"),
+          py::arg("mode"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_put", &usm_ndarray_put,
+          "Puts elements at usm_ndarray indices `ind` and axes starting "
+          "at axis `axis_start` into array `dst` from "
+          "usm_ndarray `val` synchronously. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"),
+          py::arg("mode"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_eye", &usm_ndarray_eye,
+          "Fills input 2D contiguous usm_ndarray `dst` with "
+          "zeros outside of the diagonal "
+          "specified by the diagonal index `k`, "
+          "which is filled with ones. "
+          "Returns a tuple of events: (ht_event, comp_event)",
+          py::arg("k"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("default_device_fp_type",
+          dpctl::tensor::py_internal::default_device_fp_type,
+          "Gives default floating point type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_int_type",
+          dpctl::tensor::py_internal::default_device_int_type,
+          "Gives default signed integer type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_uint_type",
+          dpctl::tensor::py_internal::default_device_uint_type,
+          "Gives default unsigned integer type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_bool_type",
+          dpctl::tensor::py_internal::default_device_bool_type,
+          "Gives default boolean type supported by device.", py::arg("dev"));
+
+    m.def("default_device_complex_type",
+          dpctl::tensor::py_internal::default_device_complex_type,
+          "Gives default complex floating point type supported by device.",
+          py::arg("dev"));
+
+    m.def("default_device_index_type",
+          dpctl::tensor::py_internal::default_device_index_type,
+          "Gives default index type supported by device.", py::arg("dev"));
+
+    auto tril_fn = [](const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends);
+    };
+    m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"),
+          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto triu_fn = [](const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends);
+    };
+    m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"),
+          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
+          py::arg("cumsum"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"),
+          py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto overlap = [](const dpctl::tensor::usm_ndarray &x1,
+                      const dpctl::tensor::usm_ndarray &x2) -> bool {
+        auto const &overlap = MemoryOverlap();
+        return overlap(x1, x2);
+    };
+    m.def("_array_overlap", overlap,
+          "Determines if the memory regions indexed by each array overlap",
+          py::arg("array1"), py::arg("array2"));
+
+    auto same_logical_tensors =
+        [](const dpctl::tensor::usm_ndarray &x1,
+           const dpctl::tensor::usm_ndarray &x2) -> bool {
+        auto const &same_logical_tensors = SameLogicalTensors();
+        return same_logical_tensors(x1, x2);
+    };
+    m.def("_same_logical_tensors", same_logical_tensors,
+          "Determines if the memory regions indexed by each array are the same",
+          py::arg("array1"), py::arg("array2"));
+
+    m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
+          py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"),
+          py::arg("mask_shape"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"),
+          py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
+                              const dpctl::tensor::usm_ndarray &dst,
+                              const dpctl::tensor::usm_ndarray &reps,
+                              const dpctl::tensor::usm_ndarray &cumsum,
+                              std::optional<const int> axis,
+                              sycl::queue &exec_q,
+                              const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(),
+                                         exec_q, depends);
+        }
+        else {
+            return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
+                                         depends);
+        }
+    };
+    m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
+          py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
+                            const dpctl::tensor::usm_ndarray &dst,
+                            const py::ssize_t reps,
+                            std::optional<const int> axis, sycl::queue &exec_q,
+                            const std::vector<sycl::event> depends)
+        -> std::pair<sycl::event, sycl::event> {
+        if (axis) {
+            return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
+                                       depends);
+        }
+        else {
+            return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
+        }
+    };
+    m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
+          py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max`] and writes the result to the "
+          "array `dst` for each element of `x`, `min`, and `max`. "
+          "Returns a tuple of events: (hev, ev)",
+          py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+}
diff --git a/dpnp/tensor/libtensor/source/tensor_elementwise.cpp b/dpnp/tensor/libtensor/source/tensor_elementwise.cpp
new file mode 100644
index 000000000000..76b9916ca9d3
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_elementwise.cpp
@@ -0,0 +1,45 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension.
+//===---------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "elementwise_functions/elementwise_common.hpp"
+
+namespace py = pybind11;
+
+PYBIND11_MODULE(_tensor_elementwise_impl, m)
+{
+    dpctl::tensor::py_internal::init_elementwise_functions(m);
+}
diff --git a/dpnp/tensor/libtensor/source/tensor_linalg.cpp b/dpnp/tensor/libtensor/source/tensor_linalg.cpp
new file mode 100644
index 000000000000..4a1b5fb79b9e
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/tensor_linalg.cpp
@@ -0,0 +1,41 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "linalg_functions/dot.hpp" +#include + +PYBIND11_MODULE(_tensor_linalg_impl, m) +{ + dpctl::tensor::py_internal::init_dot(m); +} diff --git a/dpnp/tensor/libtensor/source/tensor_reductions.cpp b/dpnp/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 000000000000..6e6a24f7b934 --- /dev/null +++ b/dpnp/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,43 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include + +#include "reductions/reduction_common.hpp" + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_reduction_functions(m); +} diff --git a/dpnp/tensor/libtensor/source/tensor_sorting.cpp b/dpnp/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 000000000000..318c3559d77c --- /dev/null +++ b/dpnp/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,55 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include + +#include "sorting/isin.hpp" +#include "sorting/merge_argsort.hpp" +#include "sorting/merge_sort.hpp" +#include "sorting/radix_argsort.hpp" +#include "sorting/radix_sort.hpp" +#include "sorting/searchsorted.hpp" +#include "sorting/topk.hpp" + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + dpctl::tensor::py_internal::init_isin_functions(m); + dpctl::tensor::py_internal::init_merge_sort_functions(m); + dpctl::tensor::py_internal::init_merge_argsort_functions(m); + dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_radix_sort_functions(m); + dpctl::tensor::py_internal::init_radix_argsort_functions(m); + dpctl::tensor::py_internal::init_topk_functions(m); +} diff --git a/dpnp/tensor/libtensor/source/triul_ctor.cpp b/dpnp/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 000000000000..13e909196460 --- /dev/null +++ b/dpnp/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,246 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <algorithm> // for std::copy
+#include <cstddef>   // for std::size_t
+#include <iterator>  // for std::begin, std::end
+#include <memory>    // for std::make_shared
+#include <utility>   // for std::pair, std::move
+#include <vector>    // for std::vector, std::begin, std::end
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "kernels/constructors.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/sycl_alloc_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+using dpctl::tensor::kernels::constructors::tri_fn_ptr_t;
+
+static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types];
+static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_triul(sycl::queue &exec_q,
+                      const dpctl::tensor::usm_ndarray &src,
+                      const dpctl::tensor::usm_ndarray &dst,
+                      char part,
+                      py::ssize_t k = 0,
+                      const std::vector<sycl::event> &depends = {})
+{
+    // array dimensions must be the same
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    if (src_nd < 2) {
+        throw py::value_error("Array dimensions less than 2.");
+    }
+
+    // shapes must be the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+
+    for (int i = 0; shapes_equal && i < src_nd; ++i) {
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        // TODO: could use a temporary, but this is done by the caller
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (dst_typeid != src_typeid) {
+        throw py::value_error("Array dtypes are not the same.");
+    }
+
+    // check same queues
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue context is not the same as allocation contexts");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto src_strides = src.get_strides_vector();
+    auto dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_nd - 2;
+    const py::ssize_t *shape = src_shape;
+
+    const shT iter_src_strides(std::begin(src_strides),
+                               std::begin(src_strides) + nd);
+    const shT iter_dst_strides(std::begin(dst_strides),
+                               std::begin(dst_strides) + nd);
+
+    simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides,
+                             // output
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    if (src_offset != 0 || dst_offset != 0) {
+        throw py::value_error("Reversed slice for dst is not supported");
+    }
+
+    nd += 2;
+
+    using usm_host_allocatorT =
+        dpctl::tensor::alloc_utils::usm_host_allocator<py::ssize_t>;
+    using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>;
+
+    usm_host_allocatorT allocator(exec_q);
+    auto shp_host_shape_and_strides =
+        std::make_shared<usmshT>(3 * nd, allocator);
+
+    std::copy(simplified_shape.begin(), simplified_shape.end(),
+              shp_host_shape_and_strides->begin());
+    (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2];
+    (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1];
+
+    std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
+              shp_host_shape_and_strides->begin() + nd);
+    (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2];
+    (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1];
+
+    std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
+              shp_host_shape_and_strides->begin() + 2 * nd);
+    (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2];
+    (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1];
+
+    auto dev_shape_and_strides_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(3 * nd,
+                                                                     exec_q);
+    py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get();
+
+    const sycl::event &copy_shape_and_strides = exec_q.copy<py::ssize_t>(
+        shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
+
+    py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2];
+    py::ssize_t outer_range = src_nelems / inner_range;
+
+    sycl::event tri_ev;
+    if (part == 'l') {
+        auto fn = tril_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+    }
+    else {
+        auto fn = triu_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
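+
+        // Editorial note (annotation, not upstream code): both branches pass
+        // {copy_shape_and_strides} as a dependency, so the packed shape and
+        // stride buffer is resident on the device before either kernel runs.
+        // The host_task submitted right after is the usual deferred-cleanup
+        // idiom for USM temporaries; a minimal hedged sketch of the pattern:
+        //
+        //   sycl::event cleanup = q.submit([&](sycl::handler &cgh) {
+        //       cgh.depends_on(kernel_ev);
+        //       cgh.host_task([ptr, ctx]() { sycl::free(ptr, ctx); });
+        //   });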
+    }
+
+    const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(tri_ev);
+        const auto &ctx = exec_q.get_context();
+        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
+        cgh.host_task(
+            [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides),
+             dev_shape_and_strides, ctx]() {
+                // capture of shp_host_shape_and_strides ensures the underlying
+                // vector exists for the entire execution of copying kernel
+                sycl_free_noexcept(dev_shape_and_strides, ctx);
+            });
+    });
+    // since host_task now owns USM allocation, release ownership by smart
+    // pointer
+    dev_shape_and_strides_owner.release();
+
+    return std::make_pair(
+        keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev);
+}
+
+void init_triul_ctor_dispatch_vectors(void)
+{
+
+    using namespace td_ns;
+    using dpctl::tensor::kernels::constructors::TrilGenericFactory;
+    using dpctl::tensor::kernels::constructors::TriuGenericFactory;
+
+    DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb1;
+    dvb1.populate_dispatch_vector(tril_generic_dispatch_vector);
+
+    DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb2;
+    dvb2.populate_dispatch_vector(triu_generic_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/triul_ctor.hpp b/dpnp/tensor/libtensor/source/triul_ctor.hpp
new file mode 100644
index 000000000000..47cc4ce8892d
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/triul_ctor.hpp
@@ -0,0 +1,58 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpnp/tensor/libtensor/source/where.cpp b/dpnp/tensor/libtensor/source/where.cpp new file mode 100644 index 000000000000..1d535a712917 --- /dev/null +++ b/dpnp/tensor/libtensor/source/where.cpp @@ -0,0 +1,264 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.where +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/where.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" +#include "where.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::search::where_strided_impl_fn_ptr_t; + +static where_contig_impl_fn_ptr_t where_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static where_strided_impl_fn_ptr_t + where_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::utils::keep_args_alive; + +std::pair + py_where(const dpctl::tensor::usm_ndarray &condition, + const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, + {x1, x2, condition, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int nd = condition.get_ndim(); + int x1_nd = x1.get_ndim(); + int x2_nd = x2.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != x1_nd || nd != x2_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for where kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for where kernel."); + } + + const py::ssize_t *x1_shape = x1.get_shape_raw(); + const py::ssize_t *x2_shape = x2.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *cond_shape = condition.get_shape_raw(); + + bool shapes_equal(true); + std::size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (x1_shape[i] == sh_i) && + (x2_shape[i] == sh_i) && (cond_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Axes are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) || + (overlap(dst, x1) && !same_logical_tensors(dst, x1)) || + (overlap(dst, x2) && !same_logical_tensors(dst, x2))) { + throw py::value_error("Destination array overlaps with input."); + } + + int x1_typenum = x1.get_typenum(); + int x2_typenum = x2.get_typenum(); + int cond_typenum = condition.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int cond_typeid = array_types.typenum_to_lookup_id(cond_typenum); + int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum); + 
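// Editorial note (annotation, not upstream code): the pair
+    // [value-type id][condition-type id] indexes the two-dimensional
+    // dispatch tables declared above; x1, x2 and dst must share one value
+    // type (checked below), while the condition may have any supported type.
+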
int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (x1_typeid != x2_typeid || x1_typeid != dst_typeid) { + throw py::value_error("Value arrays must have the same data type"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems); + + char *cond_data = condition.get_data(); + char *x1_data = x1.get_data(); + char *x2_data = x2.get_data(); + char *dst_data = dst.get_data(); + + bool is_x1_c_contig = x1.is_c_contiguous(); + bool is_x1_f_contig = x1.is_f_contiguous(); + + bool is_x2_c_contig = x2.is_c_contiguous(); + bool is_x2_f_contig = x2.is_f_contiguous(); + + bool is_cond_c_contig = condition.is_c_contiguous(); + bool is_cond_f_contig = condition.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_x1_c_contig && is_x2_c_contig && is_cond_c_contig && + is_dst_c_contig); + bool all_f_contig = (is_x1_f_contig && is_x2_f_contig && is_cond_f_contig && + is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto contig_fn = where_contig_dispatch_table[x1_typeid][cond_typeid]; + + auto where_ev = contig_fn(exec_q, nelems, cond_data, x1_data, x2_data, + dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {x1, x2, dst, condition}, {where_ev}); + + return std::make_pair(ht_ev, where_ev); + } + + auto const &cond_strides = condition.get_strides_vector(); + auto const &x1_strides = x1.get_strides_vector(); + auto const &x2_strides = x2.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_cond_strides; + shT simplified_x1_strides; + shT simplified_x2_strides; + shT simplified_dst_strides; + py::ssize_t cond_offset(0); + py::ssize_t x1_offset(0); + py::ssize_t x2_offset(0); + py::ssize_t dst_offset(0); + + simplify_iteration_space_4( + nd, x1_shape, cond_strides, x1_strides, x2_strides, dst_strides, + // outputs + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides, cond_offset, x1_offset, + x2_offset, dst_offset); + + auto fn = where_strided_dispatch_table[x1_typeid][cond_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event where_ev = fn(exec_q, nelems, nd, cond_data, x1_data, x2_data, + dst_data, packed_shape_strides, cond_offset, + x1_offset, x2_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {where_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + 
+        keep_args_alive(exec_q, {x1, x2, condition, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, where_ev);
+}
+
+void init_where_dispatch_tables(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::search::WhereContigFactory;
+    DispatchTableBuilder<where_contig_impl_fn_ptr_t, WhereContigFactory,
+                         num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(where_contig_dispatch_table);
+
+    using dpctl::tensor::kernels::search::WhereStridedFactory;
+    DispatchTableBuilder<where_strided_impl_fn_ptr_t, WhereStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(where_strided_dispatch_table);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/where.hpp b/dpnp/tensor/libtensor/source/where.hpp
new file mode 100644
index 000000000000..ba81d8b11642
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/where.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares Python API for implementation functions of
+/// dpctl.tensor.where
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    py_where(const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             const dpctl::tensor::usm_ndarray &,
+             sycl::queue &,
+             const std::vector<sycl::event> &);
+
+extern void init_where_dispatch_tables(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/zeros_ctor.cpp b/dpnp/tensor/libtensor/source/zeros_ctor.cpp
new file mode 100644
index 000000000000..b9a2e01bea4a
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/zeros_ctor.cpp
@@ -0,0 +1,159 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &,
+                                             std::size_t,
+                                             char *,
+                                             const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with zeros.
+ *
+ * @param exec_q Sycl queue to which kernel is submitted for execution.
+ * @param nelems Length of the sequence
+ * @param dst_p Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event zeros_contig_impl(sycl::queue &exec_q,
+                              std::size_t nelems,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+
+    static constexpr int memset_val(0);
+    sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                   nelems * sizeof(dstTy));
+    });
+
+    return fill_ev;
+}
+
+template <typename fnT, typename dstTy>
+struct ZerosContigFactory
+{
+    fnT get()
+    {
+        fnT f = zeros_contig_impl<dstTy>;
+        return f;
+    }
+};
+
+static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+    usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> &depends)
+{
+    py::ssize_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with the allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *dst_data = dst.get_data();
+
+    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
+        auto fn = zeros_contig_dispatch_vector[dst_typeid];
+
+        sycl::event zeros_contig_event =
+            fn(exec_q, static_cast<std::size_t>(dst_nelems), dst_data,
+               depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {dst}, {zeros_contig_event}),
+            zeros_contig_event);
+    }
+    else {
+        throw std::runtime_error(
+            "Only population of contiguous usm_ndarray objects is supported.");
+    }
+}
+
+void init_zeros_ctor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+
+    DispatchVectorBuilder<zeros_contig_fn_ptr_t, ZerosContigFactory, num_types>
+        dvb;
+    dvb.populate_dispatch_vector(zeros_contig_dispatch_vector);
+
+    return;
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tensor/libtensor/source/zeros_ctor.hpp b/dpnp/tensor/libtensor/source/zeros_ctor.hpp
new file mode 100644
index 000000000000..d104e37f5533
--- /dev/null
+++ b/dpnp/tensor/libtensor/source/zeros_ctor.hpp
@@ -0,0 +1,53 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+    usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                      sycl::queue &exec_q,
+                      const std::vector<sycl::event> &depends = {});
+
+extern void init_zeros_ctor_dispatch_vectors(void);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpnp/tests/config.py b/dpnp/tests/config.py
index a49fd8cad250..e576c643695b 100644
--- a/dpnp/tests/config.py
+++ b/dpnp/tests/config.py
@@ -4,6 +4,7 @@
 float16_types = bool(os.getenv("DPNP_TEST_FLOAT_16", 0))
 complex_types = bool(os.getenv("DPNP_TEST_COMPLEX_TYPES", 0))
 bool_types = bool(os.getenv("DPNP_TEST_BOOL_TYPES", 0))
+skip_tensor_tests = bool(int(os.getenv("SKIP_TENSOR_TESTS", 0)))
 
 infra_warnings_enable = bool(os.getenv("DPNP_INFRA_WARNINGS_ENABLE", 0))
diff --git a/dpnp/tests/conftest.py b/dpnp/tests/conftest.py
index 5d766566bca5..8e3cb97ad41f 100644
--- a/dpnp/tests/conftest.py
+++ b/dpnp/tests/conftest.py
@@ -97,6 +97,10 @@ def pytest_configure(config):
     # Equivalent to norecursedirs = tests_perf
     config.addinivalue_line("norecursedirs", "tests_perf")
 
+    # Equivalent to norecursedirs = tests/tensor (conditional)
+    if dtype_config.skip_tensor_tests:
+        config.addinivalue_line("norecursedirs", "tests/tensor")
+
     # Register pytest markers
     config.addinivalue_line(
         "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')"
diff --git a/dpnp/tests/tensor/__init__.py b/dpnp/tests/tensor/__init__.py
new file mode 100644
index 000000000000..b18d8ddc7dd1
--- /dev/null
+++ b/dpnp/tests/tensor/__init__.py
@@ -0,0 +1,31 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+__doc__ = r"""
+Test suite for tensor functionality migrated from dpctl.
+Running the test suite requires Cython and a working compiler."""
diff --git a/dpnp/tests/tensor/conftest.py b/dpnp/tests/tensor/conftest.py
new file mode 100644
index 000000000000..ea10d1322e76
--- /dev/null
+++ b/dpnp/tests/tensor/conftest.py
@@ -0,0 +1,31 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +"""Configures pytest to discover helper/ module""" + +from dpnp.tests.conftest import suppress_invalid_numpy_warnings diff --git a/dpnp/tests/tensor/elementwise/__init__.py b/dpnp/tests/tensor/elementwise/__init__.py new file mode 100644 index 000000000000..a794242cd7bb --- /dev/null +++ b/dpnp/tests/tensor/elementwise/__init__.py @@ -0,0 +1,32 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" +Collection of test and utility files for testing elementwise operations +over :class:`dpnp.tensor.usm_ndarray`. +""" diff --git a/dpnp/tests/tensor/elementwise/test_abs.py b/dpnp/tests/tensor/elementwise/test_abs.py new file mode 100644 index 000000000000..535aebfb4d58 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_abs.py @@ -0,0 +1,224 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools +import warnings + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _complex_fp_dtypes, + _real_fp_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_abs_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + if np.issubdtype(arg_dt, np.complexfloating): + type_map = { + np.dtype("c8"): np.dtype("f4"), + np.dtype("c16"): np.dtype("f8"), + } + assert dpt.abs(X).dtype == type_map[arg_dt] + + r = dpt.empty_like(X, dtype=type_map[arg_dt]) + dpt.abs(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X))) + else: + assert dpt.abs(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.abs(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_abs_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.abs(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = dpt.asnumpy(X) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +def test_abs_types_property(): + get_queue_or_skip() + types = dpt.abs.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.abs.types_ + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_abs_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + exp_dt = np.abs(np.ones(tuple(), dtype=arg_dt)).dtype + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.ones(U.shape, dtype=exp_dt) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + for ord in ["C", "F", "A", "K"]: + Y = dpt.abs(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_abs_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + Xnp = np.random.standard_normal( + size=input_shape + ) + 1j * 
np.random.standard_normal(size=input_shape) + Xnp = Xnp.astype(arg_dt) + X[...] = Xnp + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.abs(U, order=ord) + expected_Y = np.abs(np.transpose(Xnp[:, ::-1, ::-1, :], perms)) + tol = dpt.finfo(Y.dtype).resolution + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_abs_out_overlap(): + get_queue_or_skip() + + X = dpt.arange(-3, 3, 1, dtype="i4") + expected = dpt.asarray([3, 2, 1, 0, 1, 2], dtype="i4") + Y = dpt.abs(X, out=X) + + assert Y is X + assert dpt.all(expected == X) + + X = dpt.arange(-3, 3, 1, dtype="i4") + expected = expected[::-1] + Y = dpt.abs(X, out=X[::-1]) + assert Y is not X + assert dpt.all(expected == X) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_abs_real_fp_special_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + nans_ = [dpt.nan, -dpt.nan] + infs_ = [dpt.inf, -dpt.inf] + finites_ = [-1.0, -0.0, 0.0, 1.0] + inps_ = nans_ + infs_ + finites_ + + x = dpt.asarray(inps_, dtype=dtype) + r = dpt.abs(x) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + expected_np = np.abs(np.asarray(inps_, dtype=dtype)) + + expected = dpt.asarray(expected_np, dtype=dtype) + tol = dpt.finfo(r.dtype).resolution + + assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True) + + +@pytest.mark.parametrize("dtype", _complex_fp_dtypes) +def test_abs_complex_fp_special_values(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + nans_ = [dpt.nan, -dpt.nan] + infs_ = [dpt.inf, -dpt.inf] + finites_ = [-1.0, -0.0, 0.0, 1.0] + inps_ = nans_ + infs_ + finites_ + c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)] + + z = dpt.asarray(c_, dtype=dtype) + r = dpt.abs(z) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + expected_np = np.abs(np.asarray(c_, dtype=dtype)) + + expected = dpt.asarray(expected_np, dtype=dtype) + tol = dpt.finfo(r.dtype).resolution + + assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_abs_alignment(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.ones(512, dtype=dtype) + r = dpt.abs(x) + + r2 = dpt.abs(x[1:]) + assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2)) + + dpt.abs(x[:-1], out=r[1:]) + assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2)) diff --git a/dpnp/tests/tensor/elementwise/test_add.py b/dpnp/tests/tensor/elementwise/test_add.py new file mode 100644 index 000000000000..28a4efb21e94 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_add.py @@ -0,0 +1,590 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes +import re + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_add_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.add(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.add( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.add(ar1, ar2, out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.add(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.add( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all() + + r2 = dpt.empty_like(ar1, dtype=r.dtype) + dpt.add(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_add_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.add(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_add_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", 
"f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.add(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.add(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.add(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.add(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.add(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.add(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.add(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.add(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.add(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.add(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.add(ar1, ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.add(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_add_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.add(m, v) + assert (dpt.asnumpy(r) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r2 = dpt.add(v, m) + assert (dpt.asnumpy(r2) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r3 = dpt.empty_like(m) + dpt.add(m, v, out=r3) + assert (dpt.asnumpy(r3) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + r4 = dpt.empty_like(m) + dpt.add(v, m, out=r4) + assert (dpt.asnumpy(r4) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + +def test_add_broadcasting_new_shape(): + get_queue_or_skip() + + ar1 = dpt.ones((6, 1), dtype="i4") + ar2 = dpt.arange(6, dtype="i4") + + r = dpt.add(ar1, ar2) + assert (dpt.asnumpy(r) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all() + + r1 = dpt.add(ar2, ar1) + assert (dpt.asnumpy(r1) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all() + + r2 = dpt.add(ar1[::2], ar2[::2]) + assert ( + dpt.asnumpy(r2) == np.arange(1, 7, dtype="i4")[::2][np.newaxis, :] + ).all() + + r3 = dpt.empty_like(ar1) + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=r3) + + ar3 = dpt.ones((6, 1), dtype="i4") + ar4 = dpt.ones((1, 6), dtype="i4") + + r4 = dpt.add(ar3, ar4) + assert (dpt.asnumpy(r4) == np.full((6, 6), 2, dtype="i4")).all() + + r5 = dpt.add(ar4, ar3) + assert (dpt.asnumpy(r5) == np.full((6, 6), 2, dtype="i4")).all() + + r6 = dpt.add(ar3[::2], ar4[:, ::2]) + assert (dpt.asnumpy(r6) == np.full((3, 3), 2, dtype="i4")).all() + + r7 = dpt.add(ar3[::2], ar4) + assert (dpt.asnumpy(r7) == np.full((3, 6), 2, dtype="i4")).all() + + +def test_add_broadcasting_error(): + get_queue_or_skip() + m = dpt.ones((10, 10), dtype="i4") + v = dpt.ones((3,), dtype="i4") + with pytest.raises(ValueError): + dpt.add(m, v) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_add_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.add(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.add(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + 
+class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_add_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.add(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_add_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.add(a, c) + + +def test_add_types_property(): + get_queue_or_skip() + types = dpt.add.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.add.types_ + + +def test_add_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + y = dpt.empty_like(ar1, sycl_queue=cpu_queue) + with pytest.raises(dpt.ExecutionPlacementError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert "Input and output allocation queues are not compatible" in str( + excinfo.value + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = dpt.empty(3) + with pytest.raises(ValueError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert "The shape of input and output arrays are inconsistent" in str( + excinfo.value + ) + + ar1 = np.ones(2, dtype="float32") + ar2 = np.ones_like(ar1, dtype="int32") + with pytest.raises(dpt.ExecutionPlacementError) as excinfo: + dpt.add(ar1, ar2) + assert re.match( + "Execution placement can not be unambiguously inferred.*", + str(excinfo.value), + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = np.empty(ar1.shape, dtype=ar1.dtype) + with pytest.raises(TypeError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert "output array must be of usm_ndarray type" in str(excinfo.value) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_add_dtype_error( + dtype, +): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + ar1 = dpt.ones(5, dtype=dtype) + ar2 = dpt.ones_like(ar1, dtype="f4") + + y = dpt.zeros_like(ar1, dtype="int8") + with pytest.raises(ValueError) as excinfo: + dpt.add(ar1, ar2, out=y) + assert re.match("Output array of type.*is needed", str(excinfo.value)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_add_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X += int(0) + elif dt_kind == "f": + X += float(0) + elif dt_kind == "c": + X += complex(0) + elif dt_kind == "b": + X += bool(0) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # operators use a different Python 
implementation which permits + # same kind style casting + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 += ar2 + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2] + ar3 += ar4 + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype) + ).all() + else: + with pytest.raises(ValueError): + ar1 += ar2 + + # here, test the special case where out is the first argument + # so an in-place kernel is used for efficiency + # this covers a specific branch in the BinaryElementwiseFunc logic + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + dpt.add(ar1, ar2, out=ar1) + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2] + dpt.add(ar3, ar4, out=ar3) + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype) + ).all() + else: + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64): + dpt.add(ar1, ar2, out=ar2) + assert ( + dpt.asnumpy(ar2) == np.full(ar2.shape, 2, dtype=ar2.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2] + dpt.add(ar3, ar4, out=ar4) + assert ( + dpt.asnumpy(ar4) == np.full(ar4.shape, 2, dtype=ar4.dtype) + ).all() + else: + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar2) + + +def test_add_inplace_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + dpt.add(m, v, out=m) + assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + # check case where second arg is out + dpt.add(v, m, out=m) + assert ( + dpt.asnumpy(m) == np.arange(10, dtype="i4")[np.newaxis, 1:10:2] + ).all() + + +def test_add_inplace_operator_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + m += v + assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all() + + +def test_add_inplace_operator_mutual_broadcast(): + get_queue_or_skip() + + x1 = dpt.ones((1, 10), dtype="i4") + x2 = dpt.ones((10, 1), dtype="i4") + + with pytest.raises(ValueError): + dpt.add._inplace_op(x1, x2) + + +def test_add_inplace_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=cpu_queue) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones(3, dtype="float32") + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = np.ones(2, dtype="float32") + ar2 = dpt.ones(2, dtype="float32") + with pytest.raises(TypeError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = {} + with pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + ar1 = dpt.ones((2, 1), dtype="float32") + ar2 = dpt.ones((1, 2), dtype="float32") + with 
pytest.raises(ValueError): + dpt.add(ar1, ar2, out=ar1) + + +def test_add_inplace_operator_errors(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.ones(10, dtype="i4", sycl_queue=q1) + with pytest.raises(TypeError): + dpt.add._inplace_op(dict(), x) + + x.flags["W"] = False + with pytest.raises(ValueError): + dpt.add._inplace_op(x, 2) + + x_q1 = dpt.ones(10, dtype="i4", sycl_queue=q1) + x_q2 = dpt.ones(10, dtype="i4", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add._inplace_op(x_q1, x_q2) + + +def test_add_inplace_same_tensors(): + get_queue_or_skip() + + ar1 = dpt.ones(10, dtype="i4") + ar1 += ar1 + assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all() + + ar1 = dpt.ones(10, dtype="i4") + ar2 = dpt.ones(10, dtype="i4") + dpt.add(ar1, ar2, out=ar1) + # all ar1 vals should be 2 + assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all() + + dpt.add(ar2, ar1, out=ar2) + # all ar2 vals should be 3 + assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 3, dtype="i4")).all() + + dpt.add(ar1, ar2, out=ar2) + # all ar2 vals should be 5 + assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 5, dtype="i4")).all() + + +def test_add_str_repr(): + add_s = str(dpt.add) + assert isinstance(add_s, str) + assert "add" in add_s + + add_r = repr(dpt.add) + assert isinstance(add_r, str) + assert "add" in add_r + + +def test_add_cfd(): + q1 = get_queue_or_skip() + q2 = dpctl.SyclQueue(q1.sycl_device) + + x1 = dpt.ones(10, sycl_queue=q1) + x2 = dpt.ones(10, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add(x1, x2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.add(x1, x1, out=x2) + + +def test_add_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.add(x1, x2, out=out) + + +def test_add_out_need_temporary(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="u4") + + dpt.add(x[:6], 1, out=x[-6:]) + + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) diff --git a/dpnp/tests/tensor/elementwise/test_angle.py b/dpnp/tests/tensor/elementwise/test_angle.py new file mode 100644 index 000000000000..09dc2bfc414f --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_angle.py @@ -0,0 +1,111 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _complex_fp_dtypes, + _no_complex_dtypes, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_angle_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray(1, dtype=dtype, sycl_queue=q) + dt = dpt.dtype(dtype) + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(dt, dpt.complex64, _fp16, _fp64): + assert dpt.angle(x).dtype == dpt.float32 + else: + assert dpt.angle(x).dtype == dpt.float64 + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:]) +def test_angle_real(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.arange(10, dtype=dtype, sycl_queue=q) + r = dpt.angle(x) + + assert dpt.all(r == 0) + + +@pytest.mark.parametrize("dtype", _complex_fp_dtypes) +def test_angle_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + tol = 8 * dpt.finfo(dtype).resolution + vals = dpt.pi * dpt.arange(10, dtype=dpt.finfo(dtype).dtype, sycl_queue=q) + + x = dpt.zeros(10, dtype=dtype, sycl_queue=q) + + x.imag[...] = vals + r = dpt.angle(x) + expected = dpt.atan2(x.imag, x.real) + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + x.real[...] += dpt.pi + r = dpt.angle(x) + expected = dpt.atan2(x.imag, x.real) + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_angle_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + vals = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0] + vals = [complex(*val) for val in itertools.product(vals, repeat=2)] + + x = dpt.asarray(vals, dtype=dtype, sycl_queue=q) + + r = dpt.angle(x) + expected = dpt.atan2(x.imag, x.real) + + tol = 8 * dpt.finfo(dtype).resolution + + assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True) diff --git a/dpnp/tests/tensor/elementwise/test_atan2.py b/dpnp/tests/tensor/elementwise/test_atan2.py new file mode 100644 index 000000000000..7a7bb92cdd7b --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_atan2.py @@ -0,0 +1,524 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_atan2_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + r = dpt.atan2(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.arctan2( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + + tol = 8 * max( + dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution + ) + assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol) + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q) + + r = dpt.atan2(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.arctan2( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + + tol = 8 * max( + dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution + ) + assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) +def test_atan2_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.atan2(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.atan2(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_one_nan(dt): + """If either x1_i or x2_i is NaN, the 
result is NaN."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([dpt.nan, dpt.nan, 1], dtype=dt)
+    x2 = dpt.asarray([dpt.nan, 1, dpt.nan], dtype=dt)
+
+    y = dpt.atan2(x1, x2)
+    assert dpt.all(dpt.isnan(y))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_positive_and_pzero(dt):
+    """If x1_i is greater than 0 and x2_i is +0, the result
+    is an approximation to +pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+    x2 = dpt.asarray([+0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_positive_and_nzero(dt):
+    """If x1_i is greater than 0 and x2_i is -0, the result
+    is an approximation to +pi/2.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+    x2 = dpt.asarray([-0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_positive(dt):
+    """If x1_i is +0 and x2_i is greater than 0,
+    the result is +0.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(+0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert not dpt.any(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_pzero(dt):
+    """If x1_i is +0 and x2_i is +0, the result is +0."""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([+0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(+0.0, dtype=dt)
+
+    assert dpt.all(dpt.equal(actual, expected))
+    assert not dpt.any(dpt.signbit(actual))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_nzero(dt):
+    """
+    If x1_i is +0 and x2_i is -0, the result is an
+    approximation to +pi.
+    """
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x1 = dpt.asarray(+0.0, dtype=dt)
+    x2 = dpt.asarray([-0.0], dtype=dt)
+
+    actual = dpt.atan2(x1, x2)
+    expected = dpt.asarray(dpt.pi, dtype=dt)
+
+    diff = dpt.abs(dpt.subtract(actual, expected))
+    atol = 8 * dpt.finfo(diff.dtype).eps
+    assert dpt.all(dpt.less_equal(diff, atol))
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_atan2_special_case_pzero_and_negative(dt):
+    """
+    If x1_i is +0 and x2_i is less than 0, the result
+    is an approximation to +pi.
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(+0.0, dtype=dt) + x2 = dpt.asarray([-0.5, -1, -2, -dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_positive(dt): + """If x1_i is -0 and x2_i is greater than 0, + the result is -0. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-0.0, dtype=dt) + x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-0.0, dtype=dt) + + assert dpt.all(dpt.equal(actual, expected)) + assert dpt.all(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_pzero(dt): + """If x1_i is -0 and x2_i is +0, the result is -0.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-0.0, dtype=dt) + x2 = dpt.asarray([+0.0], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-0.0, dtype=dt) + + assert dpt.all(dpt.equal(actual, expected)) + assert dpt.all(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_nzero(dt): + """If x1_i is -0 and x2_i is -0, the result is + an approximation to -pi. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.0], dtype=dt) + x2 = dpt.asarray([-0.0], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nzero_and_negative(dt): + """If x1_i is -0 and x2_i is less than 0, the result + is an approximation to -pi. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.0], dtype=dt) + x2 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_negative_and_pzero(dt): + """If x1_i is less than 0 and x2_i is +0, the result + is an approximation to -pi/2. 
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt) + x2 = dpt.asarray(+0.0, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_negative_and_nzero(dt): + """If x1_i is less than 0 and x2_i is -0, the result + is an approximation to -pi/2.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt) + x2 = dpt.asarray(-0.0, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pfinite_and_pinf(dt): + """If x1_i is greater than 0, x1_i is a finite number, + and x2_i is +infinity, the result is +0.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt) + x2 = dpt.asarray(dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(+0.0, dtype=dt) + assert dpt.all(dpt.equal(actual, expected)) + assert not dpt.any(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pfinite_and_ninf(dt): + """If x1_i is greater than 0, x1_i is a finite number, + and x2_i is -infinity, the result is an approximation + to +pi.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt) + x2 = dpt.asarray(-dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nfinite_and_pinf(dt): + """If x1_i is less than 0, x1_i is a finite number, + and x2_i is +infinity, the result is -0.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt) + x2 = dpt.asarray(dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-0.0, dtype=dt) + assert dpt.all(dpt.equal(actual, expected)) + assert dpt.all(dpt.signbit(actual)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_nfinite_and_ninf(dt): + """If x1_i is less than 0, x1_i is a finite number, and + x2_i is -infinity, the result is an approximation + to -pi.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt) + x2 = dpt.asarray(-dpt.inf, dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pinf_and_finite(dt): + """If x1_i is +infinity and x2_i is a finite number, + the result is an approximation to +pi/2. 
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(dpt.inf, dtype=dt) + x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_ninf_and_finite(dt): + """If x1_i is -infinity and x2_i is a finite number, + the result is an approximation to -pi/2. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-dpt.inf, dtype=dt) + x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 2, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pinf_and_pinf(dt): + """If x1_i is +infinity and x2_i is +infinity, + the result is an approximation to +pi/4. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(dpt.inf, dtype=dt) + x2 = dpt.asarray([dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_pinf_and_ninf(dt): + """If x1_i is +infinity and x2_i is -infinity, + the result is an approximation to +3*pi/4. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(dpt.inf, dtype=dt) + x2 = dpt.asarray([-dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(3 * dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_ninf_and_pinf(dt): + """If x1_i is -infinity and x2_i is +infinity, + the result is an approximation to -pi/4. + """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-dpt.inf, dtype=dt) + x2 = dpt.asarray([dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_atan2_special_case_ninf_and_ninf(dt): + """If x1_i is -infinity and x2_i is -infinity, + the result is an approximation to -3*pi/4. 
+ """ + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray(-dpt.inf, dtype=dt) + x2 = dpt.asarray([-dpt.inf], dtype=dt) + + actual = dpt.atan2(x1, x2) + expected = dpt.asarray(-3 * dpt.pi / 4, dtype=dt) + + diff = dpt.abs(dpt.subtract(actual, expected)) + atol = 8 * dpt.finfo(diff.dtype).eps + assert dpt.all(dpt.less_equal(diff, atol)) diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_and.py b/dpnp/tests/tensor/elementwise/test_bitwise_and.py new file mode 100644 index 000000000000..c9172cb9d7d6 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_bitwise_and.py @@ -0,0 +1,142 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_and_dtype_matrix_contig(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + + x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1) + + r = dpt.bitwise_and(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype) + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype) + r_np = np.bitwise_and(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_and_dtype_matrix_strided(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2] + + x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2] + + r = dpt.bitwise_and(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2] + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2] + r_np = np.bitwise_and(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +def test_bitwise_and_bool(): + get_queue_or_skip() + + x1 = dpt.asarray([True, False]) + x2 = dpt.asarray([False, True]) + + r_bw = dpt.bitwise_and(x1[:, dpt.newaxis], x2[dpt.newaxis]) + r_lo = dpt.logical_and(x1[:, dpt.newaxis], x2[dpt.newaxis]) + + assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X &= False + else: + X &= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 &= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 &= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(ValueError): + ar1 &= ar2 diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_invert.py b/dpnp/tests/tensor/elementwise/test_bitwise_invert.py new file mode 
100644
index 000000000000..2b7a7c3a6f93
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_invert.py
@@ -0,0 +1,148 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _integral_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize(
+    "op_dtype",
+    [
+        "b1",
+    ]
+    + _integral_dtypes,
+)
+def test_bitwise_invert_dtype_matrix(op_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op_dtype, q)
+
+    sz = 7
+    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op_dtype)
+
+    r = dpt.bitwise_invert(ar1)
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == ar1.dtype
+
+    expected = np.bitwise_not(dpt.asnumpy(ar1))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar1, out=r2)
+    assert dpt.all(dpt.equal(r, r2))
+
+    ar2 = dpt.zeros(sz, dtype=op_dtype)
+    r = dpt.bitwise_invert(ar2[::-1])
+    assert isinstance(r, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.zeros(ar2.shape, dtype=op_dtype))
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar2.shape
+    assert (dpt.asnumpy(r) == expected).all()
+
+    ar3 = dpt.ones(sz, dtype=op_dtype)
+    r2 = dpt.bitwise_invert(ar3[::2])
+    assert isinstance(r2, dpt.usm_ndarray)
+
+    expected = np.bitwise_not(np.ones(ar3.shape, dtype=op_dtype)[::2])
+    assert _compare_dtypes(r2.dtype, expected.dtype, sycl_queue=q)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+    r3 = dpt.empty_like(r, dtype=r.dtype)
+    dpt.bitwise_invert(ar2[::-1], out=r3)
+    assert dpt.all(dpt.equal(r, r3))
+
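+
+# A small supplementary sketch (name and scope illustrative, not exhaustive):
+# bitwise inversion is an involution, so applying it twice must reproduce the
+# input exactly; "i4" is used here only as a representative dtype.
+def test_bitwise_invert_involution_sketch():
+    get_queue_or_skip()
+
+    x = dpt.arange(-8, 8, dtype="i4")
+    assert dpt.all(dpt.bitwise_invert(dpt.bitwise_invert(x)) == x)
+
+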
+@pytest.mark.parametrize("op_usm_type", _usm_types) +def test_bitwise_invert_usm_type_matrix(op_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type + ) + + r = dpt.bitwise_invert(ar1) + assert isinstance(r, dpt.usm_ndarray) + assert r.usm_type == op_usm_type + + +def test_bitwise_invert_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.bitwise_invert(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.bitwise_invert(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.bitwise_invert(ar1, order="A") + assert r3.flags.c_contiguous + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.zeros((20, 20), dtype="i4", order="F") + r1 = dpt.bitwise_invert(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.bitwise_invert(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.bitwise_invert(ar1, order="A") + assert r3.flags.f_contiguous + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.bitwise_invert(ar1, order="K") + assert r4.strides == (-1, 20) + + +def test_bitwise_invert_large_boolean(): + get_queue_or_skip() + + x = dpt.tril(dpt.ones((32, 32), dtype="?"), k=-1) + res = dpt.astype(dpt.bitwise_invert(x), "i4") + + assert dpt.all(res >= 0) + assert dpt.all(res <= 1) diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py b/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py new file mode 100644 index 000000000000..bb68aab227ab --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_bitwise_left_shift.py @@ -0,0 +1,150 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_left_shift_dtype_matrix_contig(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + x2 = dpt.arange(0, n, dtype=dt2) + + r = dpt.bitwise_left_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype) + x2_np = np.arange(0, n, dtype=op2_dtype) + r_np = np.left_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2 = dpt.arange(0, n, dtype=dt2)[::2] + + r = dpt.bitwise_left_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2_np = np.arange(0, n, dtype=dt2)[::2] + r_np = np.left_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_left_shift_range(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + x = dpt.ones(255, dtype=op_dtype) + y = dpt.asarray(64, dtype=op_dtype) + + z = dpt.bitwise_left_shift(x, y) + assert dpt.all(dpt.equal(z, 0)) + + +@pytest.mark.parametrize("dtype", _integral_dtypes) +def test_bitwise_left_shift_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + X <<= int(0) + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 <<= ar2 + assert dpt.all(ar1 == 2) + + ar3 = dpt.ones(sz, 
dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 <<= ar4 + assert dpt.all(ar3 == 2) + else: + with pytest.raises(ValueError): + ar1 <<= ar2 diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_or.py b/dpnp/tests/tensor/elementwise/test_bitwise_or.py new file mode 100644 index 000000000000..0e1a5bfeab1c --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_bitwise_or.py @@ -0,0 +1,158 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
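+
+# Note on test_bitwise_left_shift_range above: unlike C/C++, where shifting
+# by at least the bit width of the type is undefined behaviour, the tests pin
+# down a defined result, with shifts of >= the bit width yielding 0.  A
+# one-element sketch of the same convention ("i4" assumed for illustration):
+#
+#     x = dpt.asarray(1, dtype="i4")
+#     dpt.bitwise_left_shift(x, 32)   # expected to compare equal to 0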
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_or_dtype_matrix_contig(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + + x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1) + + r = dpt.bitwise_or(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype) + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype) + r_np = np.bitwise_or(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_or_dtype_matrix_strided(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2] + + x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2] + + r = dpt.bitwise_or(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2] + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2] + r_np = np.bitwise_or(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +def test_bitwise_or_bool(): + get_queue_or_skip() + + x1 = dpt.asarray([True, False]) + x2 = dpt.asarray([False, True]) + + r_bw = dpt.bitwise_or(x1[:, dpt.newaxis], x2[dpt.newaxis]) + r_lo = dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis]) + + assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X |= False + else: + X |= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 |= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 |= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(ValueError): + ar1 |= ar2 + dpt.bitwise_or(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, 
dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_or(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_or(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_or(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py b/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py
new file mode 100644
index 000000000000..cdd2da9ba863
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_right_shift.py
@@ -0,0 +1,166 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
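+
+# The in-place bitwise_or expectations above are gated on
+# dpnp.tensor._type_utils._can_cast: `x |= y` (and the out= variants) are
+# only accepted when y's dtype can be cast to x's dtype given the device's
+# fp16/fp64 support; otherwise ValueError is the required outcome.  Sketch
+# of the gate as used above (arguments: source dtype, destination dtype,
+# device fp16 aspect, device fp64 aspect):
+#
+#     if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+#         ar1 |= ar2            # performed in ar1's dtype
+#     else:
+#         ...                   # ValueError expected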
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_dtype_matrix_contig(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + x2 = dpt.arange(0, n, dtype=dt2) + + r = dpt.bitwise_right_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype) + x2_np = np.arange(0, n, dtype=op2_dtype) + r_np = np.right_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_dtype_matrix_strided(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]: + return + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op1_dtype) + dt2 = dpt.dtype(op2_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2 = dpt.arange(0, n, dtype=dt2)[::2] + + r = dpt.bitwise_right_shift(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + assert r.sycl_queue == x1.sycl_queue + assert r.sycl_queue == x2.sycl_queue + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2] + x2_np = np.arange(0, n, dtype=dt2)[::2] + r_np = np.right_shift(x1_np, x2_np) + + assert r.dtype == r_np.dtype + assert (dpt.asnumpy(r) == r_np).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_right_shift_range(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + x = dpt.ones(255, dtype=op_dtype) + y = dpt.asarray(64, dtype=op_dtype) + + z = dpt.bitwise_right_shift(x, y) + assert dpt.all(dpt.equal(z, 0)) + + +@pytest.mark.parametrize("dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + X >>= int(0) + + +@pytest.mark.parametrize("op1_dtype", _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", _integral_dtypes) +def test_bitwise_right_shift_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 >>= ar2 + assert dpt.all(ar1 == 0) + + ar3 = dpt.ones(sz, 
dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 >>= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(ValueError):
+            ar1 >>= ar2
+            dpt.bitwise_right_shift(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_right_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_right_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_right_shift(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_bitwise_xor.py b/dpnp/tests/tensor/elementwise/test_bitwise_xor.py
new file mode 100644
index 000000000000..60bc2c518e26
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_bitwise_xor.py
@@ -0,0 +1,158 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
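+
+# The in-place right-shift tests above only shift all-ones inputs; for
+# negative values of signed dtypes the expected semantics (matching
+# np.right_shift) are an arithmetic shift that preserves the sign.  A
+# minimal sketch, with "i4" chosen only for illustration:
+#
+#     x = dpt.asarray([-8, 8], dtype="i4")
+#     dpt.bitwise_right_shift(x, 1)   # expected: [-4, 4]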
+# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _integral_dtypes + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_xor_dtype_matrix_contig(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1) + + x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1) + + r = dpt.bitwise_xor(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype) + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype) + r_np = np.bitwise_xor(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +@pytest.mark.parametrize("op_dtype", _integral_dtypes) +def test_bitwise_xor_dtype_matrix_strided(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 11 + n = 2 * sz + dt1 = dpt.dtype(op_dtype) + dt2 = dpt.dtype(op_dtype) + + x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0 + x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2] + + x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0 + x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2] + + r = dpt.bitwise_xor(x1, x2) + assert isinstance(r, dpt.usm_ndarray) + + x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2] + x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2] + r_np = np.bitwise_xor(x1_np, x2_np) + + assert (r_np == dpt.asnumpy(r)).all() + + +def test_bitwise_xor_bool(): + get_queue_or_skip() + + x1 = dpt.asarray([True, False]) + x2 = dpt.asarray([False, True]) + + r_bw = dpt.bitwise_xor(x1[:, dpt.newaxis], x2[dpt.newaxis]) + r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis]) + + assert dpt.all(dpt.equal(r_bw, r_lo)) + + +@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes) +def test_bitwise_xor_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "b": + X ^= False + else: + X ^= int(0) + + +@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes) +@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes) +def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 ^= ar2 + assert dpt.all(ar1 == 0) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 ^= ar4 + assert dpt.all(ar3 == 0) + else: + with pytest.raises(ValueError): + ar1 ^= ar2 + dpt.bitwise_xor(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = 
dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_xor(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_xor(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(ValueError):
+            dpt.bitwise_xor(ar1, ar2, out=ar2)
diff --git a/dpnp/tests/tensor/elementwise/test_cbrt.py b/dpnp/tests/tensor/elementwise/test_cbrt.py
new file mode 100644
index 000000000000..8c063d3fbdec
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_cbrt.py
@@ -0,0 +1,98 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
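+
+# The all-zero expectations in the in-place xor tests above follow from the
+# identity x ^ x == 0 applied to all-ones operands.  The same identity gives
+# a quick self-cancellation sketch (dtype chosen only for illustration):
+#
+#     x = dpt.arange(16, dtype="u1")
+#     assert dpt.all(dpt.bitwise_xor(x, x) == 0)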
+# ***************************************************************************** + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _map_to_device_dtype, + _no_complex_dtypes, + _real_fp_dtypes, +) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_cbrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.cbrt(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.usefixtures("suppress_invalid_numpy_warnings") +def test_cbrt_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.cbrt(X) + expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + tol = dpt.finfo(dpt.float32).resolution + + assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True) diff --git a/dpnp/tests/tensor/elementwise/test_complex.py b/dpnp/tests/tensor/elementwise/test_complex.py new file mode 100644 index 000000000000..2a006a7c519a --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_complex.py @@ -0,0 +1,243 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools +import warnings + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_complex_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.real(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.real(X).dtype == expected_dtype + + expected_dtype = np.imag(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.imag(X).dtype == expected_dtype + + expected_dtype = np.conj(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.conj(X).dtype == expected_dtype + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_complex_output(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + + x1 = np.linspace(0, 10, num=n_seq, dtype=dtype) + x2 = np.linspace(0, 20, num=n_seq, dtype=dtype) + Xnp = x1 + 1j * x2 + X = dpt.asarray(Xnp, sycl_queue=q) + + Y = dpt_call(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np_call(Xnp), atol=tol, rtol=tol) + + Z = dpt.empty_like(X, dtype=Y.dtype) + dpt_call(X, out=Z) + + assert_allclose(dpt.asnumpy(Z), np_call(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("usm_type", _usm_types) +def test_complex_usm_type(np_call, dpt_call, usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("c8") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3 + X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6 + + Y = dpt_call(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + X_np = np.empty(input_shape, dtype=arg_dt) + X_np[..., 0::2] = np.complex64(np.pi / 6 + 1j * np.pi / 3) + X_np[..., 1::2] = np.complex64(np.pi / 3 + 1j * np.pi / 6) + + expected_Y = np_call(X_np) + + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_complex_order(np_call, dpt_call, dtype): + q = get_queue_or_skip() + 
skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3 + X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np_call(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt_call(U, order=ord) + assert_allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_projection_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = [ + complex(1, 2), + complex(dpt.inf, -1), + complex(0, -dpt.inf), + complex(-dpt.inf, dpt.nan), + ] + Y = [ + complex(1, 2), + complex(np.inf, -0.0), + complex(np.inf, -0.0), + complex(np.inf, 0.0), + ] + + Xf = dpt.asarray(X, dtype=dtype, sycl_queue=q) + Yf = np.array(Y, dtype=dtype) + + tol = 8 * dpt.finfo(Xf.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_projection(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + Xf = dpt.asarray(1, dtype=dtype, sycl_queue=q) + out_dtype = dpt.proj(Xf).dtype + Yf = np.array(complex(1, 0), dtype=out_dtype) + + tol = 8 * dpt.finfo(Yf.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol) + + +@pytest.mark.parametrize( + "np_call, dpt_call", + [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)], +) +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_complex_strided(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + np.random.seed(42) + strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4]) + sizes = [2, 4, 6, 8, 9, 24, 72] + tol = 8 * dpt.finfo(dtype).resolution + + low = -1000.0 + high = 1000.0 + for ii in sizes: + x1 = np.random.uniform(low=low, high=high, size=ii) + x2 = np.random.uniform(low=low, high=high, size=ii) + Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype) + X = dpt.asarray(Xnp) + Ynp = np_call(Xnp) + for jj in strides: + assert_allclose( + dpt.asnumpy(dpt_call(X[::jj])), + Ynp[::jj], + atol=tol, + rtol=tol, + ) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_complex_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0] + xc = [complex(*val) for val in itertools.product(x, repeat=2)] + + Xc_np = np.array(xc, dtype=dtype) + Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q) + + tol = 8 * dpt.finfo(dtype).resolution + + actual = dpt.real(Xc) + expected = np.real(Xc_np) + assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol) + + actual = dpt.imag(Xc) + expected = np.imag(Xc_np) + assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol) + + actual = dpt.conj(Xc) + expected = np.conj(Xc_np) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_copysign.py b/dpnp/tests/tensor/elementwise/test_copysign.py new file mode 100644 index 000000000000..f9ec5345d257 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_copysign.py @@ -0,0 +1,130 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, + _real_fp_dtypes, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_copysign_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.copysign(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.copysign(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _real_fp_dtypes) +def test_copysign_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.copysign(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.copysign(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", _real_fp_dtypes) +def test_copysign(dt): + q = get_queue_or_skip() + 
skip_if_dtype_not_supported(dt, q) + + x = dpt.arange(100, dtype=dt, sycl_queue=q) + x[1::2] *= -1 + y = dpt.ones(100, dtype=dt, sycl_queue=q) + y[::2] *= -1 + res = dpt.copysign(x, y) + expected = dpt.negative(x) + tol = dpt.finfo(dt).resolution + assert dpt.allclose(res, expected, atol=tol, rtol=tol) + + +def test_copysign_special_values(): + get_queue_or_skip() + + x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4") + y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4") + res = dpt.copysign(x1, y1) + assert dpt.all(dpt.signbit(res)) + x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4") + res = dpt.copysign(x2, y1) + assert dpt.all(dpt.signbit(res)) + y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4") + res = dpt.copysign(x2, y2) + assert not dpt.any(dpt.signbit(res)) + res = dpt.copysign(x1, y2) + assert not dpt.any(dpt.signbit(res)) diff --git a/dpnp/tests/tensor/elementwise/test_divide.py b/dpnp/tests/tensor/elementwise/test_divide.py new file mode 100644 index 000000000000..99de5a51214d --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_divide.py @@ -0,0 +1,311 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
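+
+# test_copysign_special_values above checks signs via dpt.signbit rather than
+# comparison, because -0.0 == +0.0 and NaN compares unequal to everything,
+# so equality cannot observe the transferred sign.  A minimal sketch of the
+# distinction:
+#
+#     z = dpt.copysign(dpt.asarray(0.0, dtype="f4"),
+#                      dpt.asarray(-1.0, dtype="f4"))
+#     bool(dpt.equal(z, 0.0))   # True: -0.0 equals +0.0
+#     bool(dpt.signbit(z))      # True: the sign bit is nevertheless set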
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest +from dpctl.utils import SequentialOrderManager + +import dpnp.tensor as dpt +from dpnp.tensor._tensor_elementwise_impl import _divide_by_scalar +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _complex_fp_dtypes, + _real_fp_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_divide_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.divide(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_divide_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_divide_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.divide(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.divide(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.divide(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_divide_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.divide(m, v) + + 
expected = np.divide(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.divide(v, m)
+    expected2 = np.divide(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes)
+def test_divide_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        complex(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.divide(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.divide(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_divide_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.divide(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_divide_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.divide(a, c)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes + _complex_fp_dtypes)
+def test_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "f":
+        X /= float(1)
+    elif dt_kind == "c":
+        X /= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes)
+@pytest.mark.parametrize("op2_dtype", _all_dtypes)
+def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # in-place divide is only valid when ar1 is inexact and ar2 casts to it
+    if (
+        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind")
+        and dpt.dtype(op1_dtype).kind in "fc"
+    ):
+        ar1 /= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 /= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 /= ar2
+            dpt.divide(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if (
+        _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64)
+        and dpt.dtype(op2_dtype).kind in "fc"
+    ):
+        dpt.divide(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.divide(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(ValueError):
+            dpt.divide(ar1, ar2, out=ar2)
+
+
+def test_divide_gh_1711():
+    "See https://github.com/IntelPython/dpctl/issues/1711"
+    get_queue_or_skip()
+
+    res = dpt.divide(-4, dpt.asarray(1, dtype="u4"))
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.dtype.kind == "f"
+    assert dpt.allclose(res, -4 / dpt.asarray(1, dtype="i4"))
+
+    res = dpt.divide(dpt.asarray(3, dtype="u4"), -2)
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.dtype.kind == "f"
+    assert dpt.allclose(res, dpt.asarray(3, dtype="i4") / -2)
+
+
+# don't test for overflowing double as Python won't cast
+# a Python integer of that size to a Python float
+@pytest.mark.parametrize("fp_dt", [dpt.float16, dpt.float32])
+def test_divide_by_scalar_overflow(fp_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(fp_dt, q)
+
+    x = dpt.ones(10, dtype=fp_dt, sycl_queue=q)
+    out = dpt.empty_like(x)
+
+    max_exp = np.finfo(fp_dt).maxexp
+    sca = 2**max_exp
+
+    _manager = SequentialOrderManager[q]
+    dep_evs = _manager.submitted_events
+    _, ev = _divide_by_scalar(
+        src=x, scalar=sca, dst=out, sycl_queue=q, depends=dep_evs
+    )
+    ev.wait()
+
+    assert dpt.all(out == 0)
diff --git a/dpnp/tests/tensor/elementwise/test_elementwise_classes.py b/dpnp/tests/tensor/elementwise/test_elementwise_classes.py
new file mode 100644
index 000000000000..04b92937f371
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_elementwise_classes.py
@@ -0,0 +1,150 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import pytest + +import dpnp.tensor as dpt + +from ..helper import get_queue_or_skip + +unary_fn = dpt.negative +binary_fn = dpt.divide + + +def test_unary_class_getters(): + fn = unary_fn.get_implementation_function() + assert callable(fn) + + fn = unary_fn.get_type_result_resolver_function() + assert callable(fn) + + +def test_unary_class_types_property(): + get_queue_or_skip() + loop_types = unary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_unary_class_str_repr(): + s = str(unary_fn) + r = repr(unary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = unary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_unary_read_only_out(): + get_queue_or_skip() + x = dpt.arange(32, dtype=dpt.int32) + r = dpt.empty_like(x) + r.flags["W"] = False + with pytest.raises(ValueError): + unary_fn(x, out=r) + + +def test_binary_class_getters(): + fn = binary_fn.get_implementation_function() + assert callable(fn) + + fn = binary_fn.get_implementation_inplace_function() + assert callable(fn) + + fn = binary_fn.get_type_result_resolver_function() + assert callable(fn) + + fn = binary_fn.get_type_promotion_path_acceptance_function() + assert callable(fn) + + +def test_binary_class_types_property(): + get_queue_or_skip() + loop_types = binary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_binary_class_str_repr(): + s = str(binary_fn) + r = repr(binary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = binary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_unary_class_nin(): + nin = unary_fn.nin + assert isinstance(nin, int) + assert nin == 1 + + +def test_binary_class_nin(): + nin = binary_fn.nin + assert isinstance(nin, int) + assert nin == 2 + + +def test_unary_class_nout(): + nout = unary_fn.nout + assert isinstance(nout, int) + assert nout == 1 + + +def test_binary_class_nout(): + nout = binary_fn.nout + assert isinstance(nout, int) + assert nout == 1 + + +def test_binary_read_only_out(): + get_queue_or_skip() + x1 = dpt.ones(32, dtype=dpt.float32) + x2 = dpt.ones_like(x1) + r = dpt.empty_like(x1) + r.flags["W"] = False + with pytest.raises(ValueError): + binary_fn(x1, x2, out=r) + + +def test_binary_no_inplace_op(): + get_queue_or_skip() + x1 = dpt.ones(10, dtype="i4") + x2 = dpt.ones_like(x1) + + with pytest.raises(ValueError): + dpt.logaddexp._inplace_op(x1, x2) diff --git a/dpnp/tests/tensor/elementwise/test_equal.py b/dpnp/tests/tensor/elementwise/test_equal.py new file mode 100644 index 000000000000..f5e0cd520762 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_equal.py @@ -0,0 +1,207 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.equal( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.equal( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.equal(ar1, ar2, order="A") + 
assert r3.flags.c_contiguous + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.equal(m, v) + expected = np.full((100, 5), [False, True, False, False, False], dtype="?") + + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.equal(v, m) + assert (dpt.asnumpy(r2) == expected).all() + + r3 = dpt.empty_like(m, dtype="?") + dpt.equal(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + assert dpt.all(R) + R = dpt.equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + assert dpt.all(R) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.equal(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_exp.py b/dpnp/tests/tensor/elementwise/test_exp.py new file mode 100644 index 000000000000..ca204128317e --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_exp.py @@ -0,0 +1,254 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.exp(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.exp(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_exp_real_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + n_rep = 137 + Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype) + X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q) + Y = dpt.exp(X) + with np.errstate(all="ignore"): + Ynp = np.exp(Xnp) + + tol = 8 * dpt.finfo(dtype).resolution + assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol) + + Z = dpt.empty_like(X, dtype=dtype) + dpt.exp(X, out=Z) + + assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol) + + +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_exp_complex_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + n_rep = 137 + low = -88.0 + high = 88.0 + x1 = np.random.uniform(low=low, high=high, size=n_seq) + x2 = np.random.uniform(low=low, high=high, size=n_seq) + Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype) + + X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q) + Y = dpt.exp(X) + + tol = 8 * dpt.finfo(dtype).resolution + assert_allclose( + dpt.asnumpy(Y), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol + ) + + Z = dpt.empty_like(X, dtype=dtype) + dpt.exp(X, out=Z) + + assert_allclose( + dpt.asnumpy(Z), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_exp_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 16.0 + X[..., 1::2] = 23.0 + + Y = dpt.exp(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.exp(np.float32(16.0)) + 
expected_Y[..., 1::2] = np.exp(np.float32(23.0))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 8.0
+    X[..., 1::2] = 11.0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.exp(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.exp(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_analytical_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    log2_ = 0.69314718055994530943
+    Xnp = np.array(x, dtype=dtype) * log2_
+    X = dpt.asarray(Xnp, dtype=dtype)
+    assert_allclose(dpt.asnumpy(dpt.exp(X)), np.exp(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_real_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [np.nan, np.inf, -np.inf, 0.0, -0.0]
+    Xnp = np.array(x, dtype=dtype)
+    X = dpt.asarray(x, dtype=dtype)
+
+    Y = dpt.asnumpy(dpt.exp(X))
+    Ynp = np.exp(Xnp)
+    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_exp_real_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
+        Xnp = Xnp.astype(dtype)  # astype returns a copy; rebind to test dtype
+        X = dpt.asarray(Xnp)
+        Ynp = np.exp(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.exp(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_exp_complex_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -88.0
+    high = 88.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.exp(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.exp(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_exp_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, +0.0, -0.0, +1.0, -1.0]
+    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
+
+    Xc_np = np.array(xc, dtype=dtype)
+    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
+
+    with np.errstate(all="ignore"):
+        Ynp = np.exp(Xc_np)
+    Y = dpt.exp(Xc)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_exp2.py b/dpnp/tests/tensor/elementwise/test_exp2.py
new file mode 100644
index 000000000000..ae2ab43c39be
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_exp2.py
@@ -0,0 +1,187 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.exp2(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_exp2_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + Y = dpt.exp2(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4)) + expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.exp2(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.exp2(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_exp2_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) + + # special cases for complex variant + num_finite = 1.0 + vals = [ + complex(0.0, 0.0), + complex(num_finite, dpt.inf), + complex(num_finite, dpt.nan), + complex(dpt.inf, 0.0), + complex(-dpt.inf, num_finite), + complex(dpt.inf, num_finite), + complex(-dpt.inf, dpt.inf), + complex(dpt.inf, dpt.inf), 
+ complex(-dpt.inf, dpt.nan), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 0.0), + complex(dpt.nan, num_finite), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + cis_1 = complex(np.cos(num_finite), np.sin(num_finite)) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(1.0, 0.0), + c_nan, + c_nan, + complex(np.inf, 0.0), + 0.0, + np.inf * cis_1, + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(np.nan, 0.0), + c_nan, + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_expm1.py b/dpnp/tests/tensor/elementwise/test_expm1.py new file mode 100644 index 000000000000..bb665c424564 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_expm1.py @@ -0,0 +1,187 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_expm1_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.expm1(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.expm1(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_expm1_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.expm1(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_expm1_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.expm1(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_expm1_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 / 50 + X[..., 1::2] = 1 / 25 + + Y = dpt.expm1(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.expm1(np.float32(1 / 50)) + expected_Y[..., 1::2] = np.expm1(np.float32(1 / 25)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_expm1_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 / 50 + X[..., 1::2] = 1 / 25 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.expm1(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.expm1(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_expm1_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = np.asarray([np.nan, 0.0, -0.0, np.inf, -1.0], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol) + + # special cases for complex variant + num_finite = 1.0 + vals = [ + complex(0.0, 0.0), + complex(num_finite, dpt.inf), + complex(num_finite, dpt.nan), + complex(dpt.inf, 0.0), + complex(-dpt.inf, num_finite), + complex(dpt.inf, num_finite), + complex(-dpt.inf, dpt.inf), + 
complex(dpt.inf, dpt.inf), + complex(-dpt.inf, dpt.nan), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 0.0), + complex(dpt.nan, num_finite), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + cis_1 = complex(np.cos(num_finite), np.sin(num_finite)) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(0.0, 0.0), + c_nan, + c_nan, + complex(np.inf, 0.0), + 0.0 * cis_1 - 1.0, + np.inf * cis_1 - 1.0, + complex(-1.0, 0.0), + complex(np.inf, np.nan), + complex(-1.0, 0.0), + complex(np.inf, np.nan), + complex(np.nan, 0.0), + c_nan, + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py b/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py new file mode 100644 index 000000000000..f9af864b29fe --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_floor_ceil_trunc.py @@ -0,0 +1,182 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools +import re + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _map_to_device_dtype, + _no_complex_dtypes, + _real_value_dtypes, +) + +_all_funcs = [(np.floor, dpt.floor), (np.ceil, dpt.ceil), (np.trunc, dpt.trunc)] + + +@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc]) +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_floor_ceil_trunc_out_type(dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0.1, dtype=arg_dt, sycl_queue=q) + expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device) + assert dpt_call(X).dtype == expected_dtype + + X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q) + expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device) + Y = dpt.empty_like(X, dtype=expected_dtype) + dpt_call(X, out=Y) + assert_allclose(dpt.asnumpy(dpt_call(X)), dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_floor_ceil_trunc_usm_type(np_call, dpt_call, usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = -0.4 + X[..., 1::2] = 0.7 + + Y = dpt_call(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np_call(dpt.asnumpy(X)) + tol = 8 * dpt.finfo(Y.dtype).resolution + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_floor_ceil_trunc_order(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (4, 4, 4, 4) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = -0.4 + X[..., 1::2] = 0.7 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np_call(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt_call(U, order=ord) + assert_allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc]) +@pytest.mark.parametrize("dtype", _real_value_dtypes) +def test_floor_ceil_trunc_error_dtype(dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.zeros(5, dtype=dtype) + y = dpt.empty_like(x, dtype="b1") + with pytest.raises(ValueError) as excinfo: + dpt_call(x, out=y) + assert re.match("Output array of type.*is needed", str(excinfo.value)) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_floor_ceil_trunc_contig(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 100 + n_rep = 137 + Xnp = np.linspace(-99.9, 99.9, num=n_seq, dtype=dtype) + + X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q) + Y = dpt_call(X) + + assert_allclose(dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep)) + + Z = dpt.empty_like(X, dtype=dtype) + dpt_call(X, out=Z) + + assert_allclose(dpt.asnumpy(Z), 
np.repeat(np_call(Xnp), n_rep))
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_floor_ceil_trunc_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 24, 32, 72]
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=-99.9, high=99.9, size=ii)
+        Xnp = Xnp.astype(dtype)  # astype returns a copy; rebind to test dtype
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_floor_ceil_trunc_special_cases(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, +0.0, -0.0]
+
+    xf = np.array(x, dtype=dtype)
+    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
+
+    Y_np = np_call(xf)
+    Y = dpt.asnumpy(dpt_call(yf))
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(Y, Y_np, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Y_np))
diff --git a/dpnp/tests/tensor/elementwise/test_floor_divide.py b/dpnp/tests/tensor/elementwise/test_floor_divide.py
new file mode 100644
index 000000000000..5762b09afdb3
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_floor_divide.py
@@ -0,0 +1,317 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _integral_dtypes, + _no_complex_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.floor_divide(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.floor_divide( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_floor_divide_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.floor_divide(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_floor_divide_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.floor_divide(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.floor_divide(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.floor_divide(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.floor_divide(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_floor_divide_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.floor_divide(m, v) 
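+    # the reference result below is computed with NumPy on host data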
+
+    expected = np.floor_divide(
+        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
+    )
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+    r2 = dpt.floor_divide(v, m)
+    expected2 = np.floor_divide(
+        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
+    )
+    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
+def test_floor_divide_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.floor_divide(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.floor_divide(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+class MockArray:
+    def __init__(self, arr):
+        self.data_ = arr
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        return self.data_.__sycl_usm_array_interface__
+
+
+def test_floor_divide_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+    b = dpt.ones(10)
+    c = MockArray(b)
+    r = dpt.floor_divide(a, c)
+    assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_floor_divide_canary_mock_array():
+    get_queue_or_skip()
+    a = dpt.arange(10)
+
+    class Canary:
+        def __init__(self):
+            pass
+
+        @property
+        def __sycl_usm_array_interface__(self):
+            return None
+
+    c = Canary()
+    with pytest.raises(ValueError):
+        dpt.floor_divide(a, c)
+
+
+def test_floor_divide_gh_1247():
+    get_queue_or_skip()
+
+    x = dpt.ones(1, dtype="i4")
+    res = dpt.floor_divide(x, -2)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
+    )
+
+    x = dpt.full(1, -1, dtype="i4")
+    res = dpt.floor_divide(x, 2)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
+    )
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_floor_divide_integer_zero(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.arange(10, dtype=dtype, sycl_queue=q)
+    y = dpt.zeros_like(x, sycl_queue=q)
+    res = dpt.floor_divide(x, y)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.zeros(x.shape, dtype=res.dtype)
+    )
+
+
+def test_floor_divide_special_cases():
+    q = get_queue_or_skip()
+
+    x = dpt.empty(1, dtype="f4", sycl_queue=q)
+    y = dpt.empty_like(x)
+    x[0], y[0] = dpt.inf, dpt.inf
+    res = dpt.floor_divide(x, y)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = 0.0, -1.0
+    res = dpt.floor_divide(x, y)
+    x_np = dpt.asnumpy(x)
+    y_np = dpt.asnumpy(y)
+    res_np = np.floor_divide(x_np, y_np)
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    res = dpt.floor_divide(y, x)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(y_np, x_np)
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = -1.0, dpt.inf
+    res = dpt.floor_divide(x, y)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.asarray([-0.0], dtype="f4")
+    )
+
+    res = dpt.floor_divide(y, x)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.asarray([-dpt.inf], dtype="f4")
+    )
+
+    x[0], y[0] = 1.0, dpt.nan
+    res = dpt.floor_divide(x, y)
+    res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_floor_divide_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X //= int(1)
+    elif dt_kind == "f":
+        X //= float(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    # in-place floor_divide requires ar2's dtype to cast to ar1's dtype
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 //= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 //= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(ValueError):
+            ar1 //= ar2
+            dpt.floor_divide(ar1, ar2, out=ar1)
diff --git a/dpnp/tests/tensor/elementwise/test_greater.py b/dpnp/tests/tensor/elementwise/test_greater.py
new file mode 100644
index 000000000000..eb5f2b3929df
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_greater.py
@@ -0,0 +1,314 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_greater_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.greater(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.greater(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_greater_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.greater(ar1, ar2) + expected = np.greater(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater(ar1[::-2], ar2[::2]) + expected1 = np.greater(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.greater(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.greater(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.greater(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_greater_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.greater(ar1, ar2) + expected = np.greater(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater(ar2, ar1) + expected1 = np.greater(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), 
tp) + ar3_np = dpt.asnumpy(ar3) + + r2 = dpt.greater(ar1, ar3) + expected2 = np.greater(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater(ar3, ar1) + expected3 = np.greater(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_greater_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.greater(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_greater_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.greater(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.greater(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.greater(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_greater_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.greater(m, v) + + expected = np.greater( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.greater(v, m) + expected2 = np.greater( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_greater_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.greater(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.greater(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_greater_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.greater(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_greater_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with 
pytest.raises(ValueError): + dpt.greater(a, c) + + +def test_greater_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + res = dpt.greater(x2, x1) + assert dpt.all(res[1:]) + assert not res[0] + # i8 - u8 + assert not dpt.any(dpt.greater(x1, x2)) + + # Python scalar + assert dpt.all(dpt.greater(x2, -1)) + assert not dpt.any(dpt.greater(-1, x2)) + + +def test_greater_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert py_int > x + assert not dpt.greater(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert x > -1 + assert not dpt.greater(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_greater_equal.py b/dpnp/tests/tensor/elementwise/test_greater_equal.py new file mode 100644 index 000000000000..f2e97bf62189 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_greater_equal.py @@ -0,0 +1,313 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_greater_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.greater_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.greater_equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.greater_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_greater_equal_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.greater_equal(ar1, ar2) + expected = np.greater_equal(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater_equal(ar1[::-2], ar2[::2]) + expected1 = np.greater_equal(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + r2 = dpt.greater_equal(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.greater_equal(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater_equal(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.greater_equal(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_greater_equal_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.greater_equal(ar1, ar2) + expected = np.greater_equal(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.greater_equal(ar2, ar1) + expected1 = np.greater_equal(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == 
expected1).all() + with np.errstate(invalid="ignore"): + for tp in [dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.greater_equal(ar1, ar3) + expected2 = np.greater_equal(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.greater_equal(ar3, ar1) + expected3 = np.greater_equal(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_greater_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.greater_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_greater_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.greater_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater_equal(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.greater_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.greater_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.greater_equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.greater_equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_greater_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.greater_equal(m, v) + + expected = np.greater_equal( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.greater_equal(v, m) + expected2 = np.greater_equal( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_greater_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.greater_equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.greater_equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_greater_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = 
dpt.greater_equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_greater_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.greater_equal(a, c) + + +def test_greater_equal_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + res = dpt.greater_equal(x2, x1) + assert dpt.all(res) + # i8 - u8 + res = dpt.greater_equal(x1, x2) + assert not dpt.any(res[1:]) + assert res[0] + + # Python scalar + assert dpt.all(dpt.greater_equal(x2, -1)) + assert not dpt.any(dpt.greater_equal(-1, x2)) + + +def test_greater_equal_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert py_int >= x + assert not dpt.greater_equal(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert x >= -1 + assert not dpt.greater_equal(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_hyperbolic.py b/dpnp/tests/tensor/elementwise/test_hyperbolic.py new file mode 100644 index 000000000000..b94c5ede3f2a --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_hyperbolic.py @@ -0,0 +1,202 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+_hyper_funcs = [(np.sinh, dpt.sinh), (np.cosh, dpt.cosh), (np.tanh, dpt.tanh)]
+_inv_hyper_funcs = [
+    (np.arcsinh, dpt.asinh),
+    (np.arccosh, dpt.acosh),
+    (np.arctanh, dpt.atanh),
+]
+_all_funcs = _hyper_funcs + _inv_hyper_funcs
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_hyper_out_type(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a = 1 if np_call == np.arccosh else 0
+
+    x = dpt.asarray(a, dtype=dtype, sycl_queue=q)
+    expected_dtype = np_call(np.array(a, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt_call(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_hyper_real_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    # keep the inputs inside each function's real domain
+    if np_call == np.arctanh:
+        Xnp = np.linspace(-0.9, 0.9, num=n_seq, dtype=dtype)
+    elif np_call == np.arccosh:
+        Xnp = np.linspace(1.01, 10.0, num=n_seq, dtype=dtype)
+    else:
+        Xnp = np.linspace(-10.0, 10.0, num=n_seq, dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_hyper_complex_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    low = -9.0
+    high = 9.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = x1 + 1j * x2
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    expected = np.repeat(np_call(Xnp), n_rep).astype(dtype)
+    tol = 50 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), expected, atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_hyper_real_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -10.0
+    high = 10.0
+    if np_call == np.arctanh:
+        low = -0.9
+        high = 0.9
+    elif np_call == np.arccosh:
+        low = 1.01
+        high = 100.0
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=low, high=high, size=ii)
+        # bind the cast result; a bare Xnp.astype(dtype) is a no-op that
+        # would leave the parametrized dtype untested
+        Xnp = Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
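+# The strided tests above and below slice the input with step ``jj`` to get
+# non-contiguous (possibly negative-stride) views. A minimal sketch of the
+# pattern in plain NumPy, purely illustrative (nothing here is executed):
+#
+#     a = np.arange(8, dtype="f4")   # contiguous, strides == (4,)
+#     v = a[::-2]                    # view [7., 5., 3., 1.], strides == (-8,)
+#
+# ``dpt.asarray(Xnp)[::jj]`` produces the analogous usm_ndarray view on the
+# device, whose values must match the NumPy reference within ``tol``.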
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_hyper_complex_strided(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + np.random.seed(42) + strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4]) + sizes = [2, 4, 6, 8, 9, 24, 72] + tol = 50 * dpt.finfo(dtype).resolution + + low = -8.0 + high = 8.0 + for ii in sizes: + x1 = np.random.uniform(low=low, high=high, size=ii) + x2 = np.random.uniform(low=low, high=high, size=ii) + Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype) + X = dpt.asarray(Xnp) + Ynp = np_call(Xnp) + for jj in strides: + assert_allclose( + dpt.asnumpy(dpt_call(X[::jj])), + Ynp[::jj], + atol=tol, + rtol=tol, + ) + + +@pytest.mark.parametrize("np_call, dpt_call", _all_funcs) +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_hyper_real_special_cases(np_call, dpt_call, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0] + + xf = np.array(x, dtype=dtype) + yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q) + + with np.errstate(all="ignore"): + Y_np = np_call(xf) + + tol = 8 * dpt.finfo(dtype).resolution + assert_allclose(dpt.asnumpy(dpt_call(yf)), Y_np, atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_hypot.py b/dpnp/tests/tensor/elementwise/test_hypot.py new file mode 100644 index 000000000000..bc87736318ee --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_hypot.py @@ -0,0 +1,210 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_hypot_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.zeros_like(ar1, dtype=op2_dtype, sycl_queue=q) + + r = dpt.hypot(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.hypot( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q) + ar4 = dpt.zeros(2 * sz, dtype=op2_dtype, sycl_queue=q) + + r = dpt.hypot(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.hypot( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_hypot_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.hypot(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_hypot_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.hypot(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.hypot(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.hypot(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.hypot(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.hypot(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.hypot(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_hypot_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.hypot(m, v) + + expected = np.hypot( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + tol = 8 * np.finfo(r.dtype).resolution + assert 
np.allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + + r2 = dpt.hypot(v, m) + expected2 = np.hypot( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert np.allclose( + dpt.asnumpy(r2), expected2.astype(r2.dtype), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) +def test_hypot_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.hypot(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.hypot(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_hypot_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.hypot(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_hypot_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.hypot(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_isfinite.py b/dpnp/tests/tensor/elementwise/test_isfinite.py new file mode 100644 index 000000000000..f3a6664e6916 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_isfinite.py @@ -0,0 +1,114 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isfinite_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + assert dpt.isfinite(X).dtype == dpt.bool + + +def test_isfinite_output(): + q = get_queue_or_skip() + + Xnp = np.asarray(np.nan) + X = dpt.asarray(np.nan, sycl_queue=q) + assert dpt.asnumpy(dpt.isfinite(X)) == np.isfinite(Xnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_isfinite_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = complex(np.nan, np.nan) + y2 = complex(1, np.nan) + y3 = complex(np.nan, 1) + y4 = complex(2, 1) + y5 = complex(np.inf, 1) + + Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 12) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isfinite(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_isfinite_floats(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = np.nan + y2 = 1 + y3 = np.inf + + for mult in [123, 137, 255, 271, 272]: + Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isfinite(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isfinite_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q) + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms) + expected_Y = np.full(U.shape, fill_value=True, dtype=dpt.bool) + for ord in ["C", "F", "A", "K"]: + Y = dpt.isfinite(U, order=ord) + assert_allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_isinf.py b/dpnp/tests/tensor/elementwise/test_isinf.py new file mode 100644 index 000000000000..91b2e9420446 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_isinf.py @@ -0,0 +1,108 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isinf_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    assert dpt.isinf(X).dtype == dpt.bool
+
+
+def test_isinf_output():
+    q = get_queue_or_skip()
+
+    Xnp = np.asarray(np.inf)
+    X = dpt.asarray(np.inf, sycl_queue=q)
+    assert dpt.asnumpy(dpt.isinf(X)) == np.isinf(Xnp)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_isinf_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = complex(np.inf, np.inf)
+    y2 = complex(1, np.inf)
+    y3 = complex(np.inf, 1)
+    y4 = complex(2, 1)
+    y5 = complex(np.nan, np.inf)
+    y6 = complex(np.inf, np.nan)
+
+    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5, y6], dtype=dtype), 123)
+    Y = dpt.asarray(Ynp, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_isinf_floats(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    y1 = np.nan
+    y2 = 1
+    y3 = np.inf
+    y4 = -np.inf
+
+    for mult in [123, 137, 255, 271, 272]:
+        Ynp = np.repeat(np.array([y1, y2, y3, y4], dtype=dtype), mult)
+        Y = dpt.asarray(Ynp, sycl_queue=q)
+        assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_isinf_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
+        expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.isinf(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpnp/tests/tensor/elementwise/test_isnan.py b/dpnp/tests/tensor/elementwise/test_isnan.py
new file mode 100644
index 000000000000..fe6f2660734a
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_isnan.py
@@ -0,0 +1,113 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isnan_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + assert dpt.isnan(X).dtype == dpt.bool + + +def test_isnan_output(): + q = get_queue_or_skip() + + Xnp = np.asarray(np.nan) + X = dpt.asarray(np.nan, sycl_queue=q) + assert dpt.asnumpy(dpt.isnan(X)) == np.isnan(Xnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_isnan_complex(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = complex(np.nan, np.nan) + y2 = complex(1, np.nan) + y3 = complex(np.nan, 1) + y4 = complex(2, 1) + y5 = complex(np.inf, 1) + + Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 123) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isnan(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp)) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_isnan_floats(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + y1 = np.nan + y2 = 1 + y3 = np.inf + + for mult in [123, 137, 255, 271, 272]: + Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult) + Y = dpt.asarray(Ynp, sycl_queue=q) + assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp)) + + r = dpt.empty_like(Y, dtype="bool") + dpt.isnan(Y, out=r) + assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_isnan_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q) 
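+    # Permute the axes of a sliced, flipped view so that each iteration
+    # below exercises a different memory layout with every order flag.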
+ + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms) + expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool) + for ord in ["C", "F", "A", "K"]: + Y = dpt.isnan(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_less.py b/dpnp/tests/tensor/elementwise/test_less.py new file mode 100644 index 000000000000..0abf1e440643 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_less.py @@ -0,0 +1,314 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_less_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.less(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.less(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_less_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.less(ar1, ar2) + expected = np.less(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less(ar1[::-2], ar2[::2]) + expected1 = np.less(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.less(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.less(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.less(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_less_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.less(ar1, ar2) + expected = np.less(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less(ar2, ar1) + expected1 = np.less(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), tp) + ar3_np = dpt.asnumpy(ar3) + + r2 = dpt.less(ar1, ar3) 
+ expected2 = np.less(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less(ar3, ar1) + expected3 = np.less(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_less_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.less(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_less_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.less(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.less(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.less(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.less(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.less(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.less(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_less_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.less(m, v) + + expected = np.less( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.less(v, m) + expected2 = np.less( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_less_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.less(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.less(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_less_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.less(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_less_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.less(a, c) + + +def test_less_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, 
dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + assert not dpt.any(dpt.less(x2, x1)) + # i8 - u8 + res = dpt.less(x1, x2) + assert not res[0] + assert dpt.all(res[1:]) + + # Python scalar + assert not dpt.any(dpt.less(x2, -1)) + assert dpt.all(dpt.less(-1, x2)) + + +def test_less_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert not py_int < x + assert dpt.less(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert not x < -1 + assert dpt.less(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_less_equal.py b/dpnp/tests/tensor/elementwise/test_less_equal.py new file mode 100644 index 000000000000..1a5744475210 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_less_equal.py @@ -0,0 +1,313 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_less_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.less_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.less_equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.less_equal( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_less_equal_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.less_equal(ar1, ar2) + expected = np.less_equal(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less_equal(ar1[::-2], ar2[::2]) + expected1 = np.less_equal(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype) + ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.less_equal(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.less_equal(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less_equal(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.less_equal(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_less_equal_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8") + ar2 = dpt.full((4,), 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.less_equal(ar1, ar2) + expected = np.less_equal(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.less_equal(ar2, ar1) + expected1 = np.less_equal(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in 
[dpt.nan, dpt.inf, -dpt.inf]: + + ar3 = dpt.full((4,), tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.less_equal(ar1, ar3) + expected2 = np.less_equal(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.less_equal(ar3, ar1) + expected3 = np.less_equal(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_less_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.less_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_less_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.less_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less_equal(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.less_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.less_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.less_equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.less_equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_less_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.less_equal(m, v) + + expected = np.less_equal( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.less_equal(v, m) + expected2 = np.less_equal( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_less_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.less_equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.less_equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_less_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.less_equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_less_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class 
Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.less_equal(a, c) + + +def test_less_equal_mixed_integer_kinds(): + get_queue_or_skip() + + x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8")) + x2 = dpt.arange(10, dtype="u8") + + # u8 - i8 + res = dpt.less_equal(x2, x1) + assert res[0] + assert not dpt.any(res[1:]) + # i8 - u8 + assert dpt.all(dpt.less_equal(x1, x2)) + + # Python scalar + assert not dpt.any(dpt.less_equal(x2, -1)) + assert dpt.all(dpt.less_equal(-1, x2)) + + +def test_less_equal_very_large_py_int(): + get_queue_or_skip() + + py_int = dpt.iinfo(dpt.int64).max + 10 + + x = dpt.asarray(3, dtype="u8") + assert not py_int <= x + assert dpt.less_equal(x, py_int) + + x = dpt.asarray(py_int, dtype="u8") + assert not x <= -1 + assert dpt.less_equal(-1, x) diff --git a/dpnp/tests/tensor/elementwise/test_log.py b/dpnp/tests/tensor/elementwise/test_log.py new file mode 100644 index 000000000000..b41fa85df05e --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log.py @@ -0,0 +1,149 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose, assert_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.log(np.array(1, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + Y = dpt.log(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log(np.float32(4 * dpt.e)) + expected_Y[..., 1::2] = np.log(np.float32(10 * dpt.e)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_log_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -dpt.inf, -1.0, -0.0, 0.0, dpt.inf], dtype="f4", sycl_queue=q + ) + Y = dpt.log(X) + + expected = np.array( + [np.nan, np.nan, np.nan, -np.inf, -np.inf, np.inf], dtype="f4" + ) + + assert_equal(dpt.asnumpy(Y), expected) diff --git a/dpnp/tests/tensor/elementwise/test_log10.py b/dpnp/tests/tensor/elementwise/test_log10.py new file mode 100644 index 000000000000..02c652293b9d --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log10.py @@ -0,0 +1,152 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.log10(np.array(1, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log10(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log10(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose( + dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log10(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose( + dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e 
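+    # X holds two known positive constants (4e and 10e), so the expected
+    # log10 values can be formed slice-by-slice below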
+ + Y = dpt.log10(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log10(np.float32(4 * dpt.e)) + expected_Y[..., 1::2] = np.log10(np.float32(10 * dpt.e)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log10(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log10(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_log_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q + ) + Xnp = dpt.asnumpy(X) + + with np.errstate(invalid="ignore", divide="ignore"): + assert_equal(dpt.asnumpy(dpt.log10(X)), np.log10(Xnp)) diff --git a/dpnp/tests/tensor/elementwise/test_log1p.py b/dpnp/tests/tensor/elementwise/test_log1p.py new file mode 100644 index 000000000000..eb6205650e10 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log1p.py @@ -0,0 +1,188 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log1p_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.log1p(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log1p(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log1p_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log1p(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log1p_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log1p(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log1p_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = dpt.e / 1000 + X[..., 1::2] = dpt.e / 100 + + Y = dpt.log1p(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log1p(np.float32(dpt.e / 1000)) + expected_Y[..., 1::2] = np.log1p(np.float32(dpt.e / 100)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log1p_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = dpt.e / 1000 + X[..., 1::2] = dpt.e / 100 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log1p(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log1p(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_log1p_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -2.0, -1.0, -0.0, 0.0, dpt.inf], + dtype="f4", + sycl_queue=q, + ) + res = np.asarray([np.nan, np.nan, -np.inf, -0.0, 0.0, np.inf], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(divide="ignore", invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.log1p(X)), res, atol=tol, rtol=tol) + + # special cases for complex + vals = [ + complex(-1.0, 0.0), + complex(2.0, dpt.inf), + complex(2.0, dpt.nan), + complex(-dpt.inf, 1.0), + 
complex(dpt.inf, 1.0), + complex(-dpt.inf, dpt.inf), + complex(dpt.inf, dpt.inf), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 1.0), + complex(dpt.nan, dpt.inf), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(-np.inf, 0.0), + complex(np.inf, np.pi / 2), + c_nan, + complex(np.inf, np.pi), + complex(np.inf, 0.0), + complex(np.inf, 3 * np.pi / 4), + complex(np.inf, np.pi / 4), + complex(np.inf, np.nan), + c_nan, + complex(np.inf, np.nan), + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + dpt_res = dpt.asnumpy(dpt.log1p(X)) + assert_allclose(np.real(dpt_res), np.real(res), atol=tol, rtol=tol) + assert_allclose(np.imag(dpt_res), np.imag(res), atol=tol, rtol=tol) diff --git a/dpnp/tests/tensor/elementwise/test_log2.py b/dpnp/tests/tensor/elementwise/test_log2.py new file mode 100644 index 000000000000..7cd2f4615133 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_log2.py @@ -0,0 +1,148 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _map_to_device_dtype, + _usm_types, +) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.log2(np.array(1, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.log2(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.log2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_log_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.log2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_log_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + Y = dpt.log2(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.log2(np.float32(4 * dpt.e)) + expected_Y[..., 1::2] = np.log2(np.float32(10 * dpt.e)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_log_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 4 * dpt.e + X[..., 1::2] = 10 * dpt.e + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.log2(dpt.asnumpy(U)) + for ord in ["C", "F", "A", "K"]: + Y = dpt.log2(U, order=ord) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + np.testing.assert_allclose( + dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol + ) + + +def test_log_special_cases(): + q = get_queue_or_skip() + + X = dpt.asarray( + [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q + ) + Xnp = dpt.asnumpy(X) + + with np.errstate(invalid="ignore", divide="ignore"): + assert_equal(dpt.asnumpy(dpt.log2(X)), np.log2(Xnp)) diff --git a/dpnp/tests/tensor/elementwise/test_logaddexp.py b/dpnp/tests/tensor/elementwise/test_logaddexp.py new file mode 100644 index 000000000000..fc16c1722d98 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logaddexp.py @@ -0,0 
+1,211 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes +import re + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_logaddexp_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.logaddexp(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logaddexp(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + tol = 8 * max( + np.finfo(r.dtype).resolution, np.finfo(expected.dtype).resolution + ) + assert_allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logaddexp(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logaddexp(dpt.asnumpy(ar3)[::-1], dpt.asnumpy(ar4)[::2]) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert_allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logaddexp_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) 
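+    # both operands hold the same values and dtype; only their USM
+    # allocation types differ, which is what is under test here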
+ ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.logaddexp(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logaddexp_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.logaddexp(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logaddexp(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logaddexp(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.logaddexp(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logaddexp(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logaddexp(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.logaddexp(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.logaddexp(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_logaddexp_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.logaddexp(m, v) + + expected = np.logaddexp( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.logaddexp(v, m) + expected2 = np.logaddexp( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +def test_logaddexp_broadcasting_error(): + get_queue_or_skip() + m = dpt.ones((10, 10), dtype="i4") + v = dpt.ones((3,), dtype="i4") + with pytest.raises(ValueError): + dpt.logaddexp(m, v) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +def test_logaddexp_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.logaddexp(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.logaddexp(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_logaddexp_dtype_error( + dtype, +): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + ar1 = dpt.ones(5, dtype=dtype) + ar2 = dpt.ones_like(ar1, dtype="f4") + + y = dpt.zeros_like(ar1, dtype="int8") + with pytest.raises(ValueError) as excinfo: + dpt.logaddexp(ar1, ar2, out=y) + assert re.match("Output array of type.*is needed", str(excinfo.value)) diff --git a/dpnp/tests/tensor/elementwise/test_logical_and.py b/dpnp/tests/tensor/elementwise/test_logical_and.py new file 
mode 100644 index 000000000000..09f5838265af --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_and.py @@ -0,0 +1,321 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_logical_and_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype) + ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype) + + r = dpt.logical_and(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_and(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_and(ar1, ar2, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logical_and(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logical_and( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_and(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + +@pytest.mark.parametrize("op_dtype", 
["c8", "c16"]) +def test_logical_and_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 2, sz) + ar2_np_imag = np.random.randint(0, 2, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.logical_and(ar1, ar2) + expected = np.logical_and(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_and(ar1[::-2], ar2[::2]) + expected1 = np.logical_and(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.logical_and(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.logical_and(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_and(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.logical_and(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_logical_and_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + ar2 = dpt.full(ar1.shape, 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.logical_and(ar1, ar2) + expected = np.logical_and(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_and(ar2, ar1) + expected1 = np.logical_and(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar3 = dpt.full(ar1.shape, tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.logical_and(ar1, ar3) + expected2 = np.logical_and(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_and(ar3, ar1) + expected3 = np.logical_and(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logical_and_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type + ) + ar2 = dpt.asarray( + np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type + ) + + r = dpt.logical_and(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logical_and_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_and(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_and(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_and(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + 
ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.logical_and(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_and(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_and(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_and(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_logical_and_broadcasting(): + get_queue_or_skip() + + m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.logical_and(m, v) + + expected = np.logical_and(dpt.asnumpy(m), dpt.asnumpy(v)) + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.logical_and(v, m) + expected2 = np.logical_and(dpt.asnumpy(v), dpt.asnumpy(m)) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.empty_like(r) + dpt.logical_and(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + r4 = dpt.empty_like(r) + dpt.logical_and(v, m, out=r4) + assert (dpt.asnumpy(r4) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("scalar_val", [0, 1]) +def test_logical_and_python_scalar(arr_dt, scalar_val): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.asarray( + np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q + ) + py_ones = ( + bool(scalar_val), + int(scalar_val), + float(scalar_val), + complex(scalar_val), + np.float32(scalar_val), + ctypes.c_int(scalar_val), + ) + for sc in py_ones: + R = dpt.logical_and(X, sc) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_and(dpt.asnumpy(X), sc) + assert (dpt.asnumpy(R) == E).all() + + R = dpt.logical_and(sc, X) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_and(sc, dpt.asnumpy(X)) + assert (dpt.asnumpy(R) == E).all() + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_logical_and_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.logical_and(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_logical_and_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.logical_and(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_logical_not.py b/dpnp/tests/tensor/elementwise/test_logical_not.py new file mode 100644 index 000000000000..fa1d5e786bd3 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_not.py @@ -0,0 +1,198 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op_dtype", _all_dtypes) +def test_logical_not_dtype_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 7 + ar1_np = np.random.randint(0, 2, sz) + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + r = dpt.logical_not(ar1) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_not(ar1_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_not(ar1, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar2 = dpt.zeros(sz, dtype=op_dtype) + r = dpt.logical_not(ar2[::-1]) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_not(np.zeros(ar2.shape, dtype=op_dtype)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar2.shape + assert (dpt.asnumpy(r) == expected).all() + + ar3 = dpt.ones(sz, dtype=op_dtype) + r2 = dpt.logical_not(ar3[::2]) + assert isinstance(r2, dpt.usm_ndarray) + + expected = np.logical_not(np.ones(ar3.shape, dtype=op_dtype)[::2]) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert (dpt.asnumpy(r2) == expected).all() + + r3 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_not(ar2[::-1], out=r3) + assert (dpt.asnumpy(r) == dpt.asnumpy(r3)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_logical_not_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np,
dtype=op_dtype) + + r = dpt.logical_not(ar1) + expected = np.logical_not(ar1_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_not(ar1[::-2]) + expected1 = np.logical_not(ar1_np[::-2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar2 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar2_np = dpt.asnumpy(ar2) + r2 = dpt.logical_not(ar2) + with np.errstate(invalid="ignore"): + expected2 = np.logical_not(ar2_np) + assert (dpt.asnumpy(r2) == expected2).all() + + +def test_logical_not_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + + r = dpt.logical_not(ar1) + expected = np.logical_not(dpt.asnumpy(ar1)) + assert (dpt.asnumpy(r) == expected).all() + + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar2 = dpt.full(ar1.shape, tp) + r2 = dpt.logical_not(ar2) + expected2 = np.logical_not(dpt.asnumpy(ar2)) + assert (dpt.asnumpy(r2) == expected2).all() + + +@pytest.mark.parametrize("op_usm_type", _usm_types) +def test_logical_not_usm_type_matrix(op_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type + ) + + r = dpt.logical_not(ar1) + assert isinstance(r, dpt.usm_ndarray) + assert r.usm_type == op_usm_type + + +def test_logical_not_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_not(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_not(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_not(ar1, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_not(ar1, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.zeros((20, 20), dtype="i4", order="F") + r1 = dpt.logical_not(ar1, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_not(ar1, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_not(ar1, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_not(ar1, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_not(ar1, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_not(ar1, order="K") + assert r4.strides == (-1, 20) diff --git a/dpnp/tests/tensor/elementwise/test_logical_or.py b/dpnp/tests/tensor/elementwise/test_logical_or.py new file mode 100644 index 000000000000..42c7e6f645b3 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_or.py @@ -0,0 +1,322 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_logical_or_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype) + ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype) + + r = dpt.logical_or(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_or(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_or(ar1, ar2, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logical_or(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logical_or( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_or(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_logical_or_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 2, sz) + ar2_np_imag = np.random.randint(0, 2, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.logical_or(ar1, ar2) + expected = np.logical_or(ar1_np, ar2_np) + 
assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_or(ar1[::-2], ar2[::2]) + expected1 = np.logical_or(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == expected1).all() + + ar3 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.logical_or(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.logical_or(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_or(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.logical_or(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_logical_or_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + ar2 = dpt.full(ar1.shape, 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.logical_or(ar1, ar2) + expected = np.logical_or(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_or(ar2, ar1) + expected1 = np.logical_or(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar3 = dpt.full(ar1.shape, tp) + ar3_np = dpt.asnumpy(ar3) + + r2 = dpt.logical_or(ar1, ar3) + expected2 = np.logical_or(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_or(ar3, ar1) + expected3 = np.logical_or(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logical_or_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type + ) + ar2 = dpt.asarray( + np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type + ) + + r = dpt.logical_or(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logical_or_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_or(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_or(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_or(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.logical_or(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_or(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_or(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.strides == (20, -1) + 
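+    # repeat the strided checks on transposed views; order="K" should
+    # preserve the permuted strides asserted below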
+ ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_or(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_logical_or_broadcasting(): + get_queue_or_skip() + + m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.logical_or(m, v) + + expected = np.logical_or(dpt.asnumpy(m), dpt.asnumpy(v)) + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.logical_or(v, m) + expected2 = np.logical_or(dpt.asnumpy(v), dpt.asnumpy(m)) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.empty_like(r) + dpt.logical_or(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + r4 = dpt.empty_like(r) + dpt.logical_or(v, m, out=r4) + assert (dpt.asnumpy(r4) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("scalar_val", [0, 1]) +def test_logical_or_python_scalar(arr_dt, scalar_val): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.asarray( + np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q + ) + py_ones = ( + bool(scalar_val), + int(scalar_val), + float(scalar_val), + complex(scalar_val), + np.float32(scalar_val), + ctypes.c_int(scalar_val), + ) + for sc in py_ones: + R = dpt.logical_or(X, sc) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_or(dpt.asnumpy(X), sc) + assert (dpt.asnumpy(R) == E).all() + + R = dpt.logical_or(sc, X) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_or(sc, dpt.asnumpy(X)) + assert (dpt.asnumpy(R) == E).all() + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_logical_or_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.logical_or(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_logical_or_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.logical_or(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_logical_xor.py b/dpnp/tests/tensor/elementwise/test_logical_xor.py new file mode 100644 index 000000000000..da2b79974f12 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_logical_xor.py @@ -0,0 +1,323 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_logical_xor_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1_np = np.random.randint(0, 2, sz) + ar1 = dpt.asarray(ar1_np, dtype=op1_dtype) + ar2_np = np.random.randint(0, 2, sz) + ar2 = dpt.asarray(ar2_np, dtype=op2_dtype) + + r = dpt.logical_xor(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + + expected = np.logical_xor(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_xor(ar1, ar2, out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + ar3 = dpt.zeros(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logical_xor(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logical_xor( + np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.empty_like(r, dtype=r.dtype) + dpt.logical_xor(ar3[::-1], ar4[::2], out=r2) + assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_logical_xor_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 2, sz) + ar1_np_imag = np.random.randint(0, 2, sz) + ar1_np = ar1_np_real + 1j * ar1_np_imag + ar1 = dpt.asarray(ar1_np, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 2, sz) + ar2_np_imag = np.random.randint(0, 2, sz) + ar2_np = ar2_np_real + 1j * ar2_np_imag + ar2 = dpt.asarray(ar2_np, dtype=op_dtype) + + r = dpt.logical_xor(ar1, ar2) + expected = np.logical_xor(ar1_np, ar2_np) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_xor(ar1[::-2], ar2[::2]) + expected1 = np.logical_xor(ar1_np[::-2], ar2_np[::2]) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert (dpt.asnumpy(r1) == 
expected1).all() + + ar3 = dpt.asarray( + [ + 2.0 + 0j, + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ], + dtype=op_dtype, + ) + ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype) + + ar3_np = dpt.asnumpy(ar3) + ar4_np = dpt.asnumpy(ar4) + + r2 = dpt.logical_xor(ar3, ar4) + with np.errstate(invalid="ignore"): + expected2 = np.logical_xor(ar3_np, ar4_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_xor(ar4, ar4) + with np.errstate(invalid="ignore"): + expected3 = np.logical_xor(ar4_np, ar4_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +def test_logical_xor_complex_float(): + get_queue_or_skip() + + ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8") + ar2 = dpt.full(ar1.shape, 2, dtype="f4") + + ar1_np = dpt.asnumpy(ar1) + ar2_np = dpt.asnumpy(ar2) + + r = dpt.logical_xor(ar1, ar2) + expected = np.logical_xor(ar1_np, ar2_np) + assert (dpt.asnumpy(r) == expected).all() + + r1 = dpt.logical_xor(ar2, ar1) + expected1 = np.logical_xor(ar2_np, ar1_np) + assert (dpt.asnumpy(r1) == expected1).all() + with np.errstate(invalid="ignore"): + for tp in [ + dpt.nan, + dpt.nan * 1j, + dpt.inf, + dpt.inf * 1j, + -dpt.inf, + -dpt.inf * 1j, + ]: + ar3 = dpt.full(ar1.shape, tp) + ar3_np = dpt.asnumpy(ar3) + r2 = dpt.logical_xor(ar1, ar3) + expected2 = np.logical_xor(ar1_np, ar3_np) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.logical_xor(ar3, ar1) + expected3 = np.logical_xor(ar3_np, ar1_np) + assert (dpt.asnumpy(r3) == expected3).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logical_xor_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.asarray( + np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type + ) + ar2 = dpt.asarray( + np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type + ) + + r = dpt.logical_xor(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_logical_xor_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.logical_xor(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_xor(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_xor(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.logical_xor(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logical_xor(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logical_xor(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.logical_xor(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_logical_xor_broadcasting(): + get_queue_or_skip() + + m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4") + v = 
dpt.arange(1, 6, dtype="i4") + + r = dpt.logical_xor(m, v) + + expected = np.logical_xor(dpt.asnumpy(m), dpt.asnumpy(v)) + assert (dpt.asnumpy(r) == expected).all() + + r2 = dpt.logical_xor(v, m) + expected2 = np.logical_xor(dpt.asnumpy(v), dpt.asnumpy(m)) + assert (dpt.asnumpy(r2) == expected2).all() + + r3 = dpt.empty_like(r) + dpt.logical_xor(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + r4 = dpt.empty_like(r) + dpt.logical_xor(v, m, out=r4) + assert (dpt.asnumpy(r4) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("scalar_val", [0, 1]) +def test_logical_xor_python_scalar(arr_dt, scalar_val): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.asarray( + np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q + ) + py_ones = ( + bool(scalar_val), + int(scalar_val), + float(scalar_val), + complex(scalar_val), + np.float32(scalar_val), + ctypes.c_int(scalar_val), + ) + for sc in py_ones: + R = dpt.logical_xor(X, sc) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_xor(dpt.asnumpy(X), sc) + assert (dpt.asnumpy(R) == E).all() + + R = dpt.logical_xor(sc, X) + assert isinstance(R, dpt.usm_ndarray) + E = np.logical_xor(sc, dpt.asnumpy(X)) + assert (dpt.asnumpy(R) == E).all() + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_logical_xor_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.logical_xor(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_logical_xor_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.logical_xor(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_maximum_minimum.py b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py new file mode 100644 index 000000000000..2eb6d9de7582 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_maximum_minimum.py @@ -0,0 +1,329 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes +import itertools + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_maximum_minimum_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1_np = np.arange(sz) + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, dtype=op1_dtype) + ar2_np = np.arange(sz) + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, dtype=op2_dtype) + + r = dpt.maximum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.maximum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype)) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.minimum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.minimum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype)) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3_np = np.arange(sz) + np.random.shuffle(ar3_np) + ar3 = dpt.asarray(ar3_np, dtype=op1_dtype) + ar4_np = np.arange(2 * sz) + np.random.shuffle(ar4_np) + ar4 = dpt.asarray(ar4_np, dtype=op2_dtype) + + r = dpt.maximum(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.maximum( + ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype) + ) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + r = dpt.minimum(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.minimum( + ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype) + ) + + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected).all() + + +@pytest.mark.parametrize("op_dtype", ["c8", "c16"]) +def test_maximum_minimum_complex_matrix(op_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op_dtype, q) + + sz = 127 + ar1_np_real = np.random.randint(0, 10, sz) + ar1_np_imag = np.random.randint(0, 10, sz) + ar1 = dpt.asarray(ar1_np_real + 1j * ar1_np_imag, dtype=op_dtype) + + ar2_np_real = np.random.randint(0, 10, sz) + ar2_np_imag = np.random.randint(0, 10, sz) + ar2 = dpt.asarray(ar2_np_real + 1j * ar2_np_imag, dtype=op_dtype) + + r = dpt.maximum(ar1, ar2) + expected = np.maximum(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, 
expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert_array_equal(dpt.asnumpy(r), expected) + + r1 = dpt.maximum(ar1[::-2], ar2[::2]) + expected1 = np.maximum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2])) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert_array_equal(dpt.asnumpy(r1), expected1) + + r = dpt.minimum(ar1, ar2) + expected = np.minimum(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == expected.shape + assert_array_equal(dpt.asnumpy(r), expected) + + r1 = dpt.minimum(ar1[::-2], ar2[::2]) + expected1 = np.minimum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2])) + assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q) + assert r1.shape == expected1.shape + assert_array_equal(dpt.asnumpy(r1), expected1) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_maximum_minimum_real_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, np.inf, -np.inf, 5.0, -3.0] + x = list(itertools.product(x, repeat=2)) + Xnp = np.array([tup[0] for tup in x], dtype=dtype) + Ynp = np.array([tup[1] for tup in x], dtype=dtype) + X = dpt.asarray(Xnp, dtype=dtype) + Y = dpt.asarray(Ynp, dtype=dtype) + + R = dpt.maximum(X, Y) + Rnp = np.maximum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(R), Rnp) + + R = dpt.minimum(X, Y) + Rnp = np.minimum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(R), Rnp) + + +@pytest.mark.parametrize("dtype", ["c8", "c16"]) +def test_maximum_minimum_complex_special_cases(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = [np.nan, np.inf, -np.inf, +2.0, -1.0] + x = [complex(*val) for val in itertools.product(x, repeat=2)] + x = list(itertools.product(x, repeat=2)) + + Xnp = np.array([tup[0] for tup in x], dtype=dtype) + Ynp = np.array([tup[1] for tup in x], dtype=dtype) + X = dpt.asarray(Xnp, dtype=dtype, sycl_queue=q) + Y = dpt.asarray(Ynp, dtype=dtype, sycl_queue=q) + + R = dpt.maximum(X, Y) + Rnp = np.maximum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp)) + assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp)) + + R = dpt.minimum(X, Y) + Rnp = np.minimum(Xnp, Ynp) + assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp)) + assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp)) + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_maximum_minimum_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1_np = np.arange(sz, dtype="i4") + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, usm_type=op1_usm_type) + ar2_np = np.arange(sz, dtype="i4") + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, usm_type=op2_usm_type) + + r = dpt.maximum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + r = dpt.minimum(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_maximum_minimum_order(): + get_queue_or_skip() + + ar1_np = np.arange(20 * 20, dtype="i4").reshape(20, 20) + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, order="C") + ar2_np = np.arange(20 * 20, dtype="i4").reshape(20, 20) + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, order="C") + + r1 
= dpt.maximum(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.maximum(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.maximum(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.asarray(ar1_np, order="F") + ar2 = dpt.asarray(ar2_np, order="F") + r1 = dpt.maximum(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.maximum(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.maximum(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1_np = np.arange(40 * 40, dtype="i4").reshape(40, 40) + np.random.shuffle(ar1_np) + ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2] + ar2_np = np.arange(40 * 40, dtype="i4").reshape(40, 40) + np.random.shuffle(ar2_np) + ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2] + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2].mT + ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2].mT + r4 = dpt.maximum(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_maximum_minimum_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.maximum(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.maximum(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + R = dpt.minimum(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.minimum(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_maximum_minimum_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.maximum(a, c) + assert isinstance(r, dpt.usm_ndarray) + + r = dpt.minimum(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_maximum_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.maximum(a, c) + + with pytest.raises(ValueError): + dpt.minimum(a, c) diff --git a/dpnp/tests/tensor/elementwise/test_multiply.py b/dpnp/tests/tensor/elementwise/test_multiply.py new file mode 100644 index 000000000000..33dbef03f347 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_multiply.py @@ -0,0 +1,251 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_multiply_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.multiply(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.multiply( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_multiply_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.multiply(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_multiply_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), 
dtype="i4", order="F") + r1 = dpt.multiply(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.multiply(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.multiply(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.multiply(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_multiply_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.multiply(m, v) + + expected = np.multiply( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.multiply(v, m) + expected2 = np.multiply( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_multiply_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.multiply(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.multiply(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +@pytest.mark.parametrize("sc", [bool(1), int(1), float(1), complex(1)]) +def test_multiply_python_scalar_gh1219(arr_dt, sc): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + Xnp = np.ones(4, dtype=arr_dt) + + X = dpt.ones(4, dtype=arr_dt, sycl_queue=q) + + R = dpt.multiply(X, sc) + Rnp = np.multiply(Xnp, sc) + assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q) + + # symmetric case + R = dpt.multiply(sc, X) + Rnp = np.multiply(sc, Xnp) + assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_multiply_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X *= int(1) + elif dt_kind == "f": + X *= float(1) + elif dt_kind == "c": + X *= complex(1) + elif dt_kind == "b": + X *= bool(1) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_multiply_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 *= ar2 + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + ar3[::-1] *= ar4[::2] + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype) + ).all() + + else: + with pytest.raises(ValueError): + ar1 *= ar2 + + +def 
test_multiply_inplace_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + m *= v + assert (dpt.asnumpy(m) == np.arange(0, 5, dtype="i4")[np.newaxis, :]).all() diff --git a/dpnp/tests/tensor/elementwise/test_negative.py b/dpnp/tests/tensor/elementwise/test_negative.py new file mode 100644 index 000000000000..9713f0ecb364 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_negative.py @@ -0,0 +1,101 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_negative_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + assert dpt.negative(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.negative(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.negative(X))) + + +def test_negative_bool(): + get_queue_or_skip() + x = dpt.ones(64, dtype="?") + with pytest.raises(ValueError): + dpt.negative(x) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_negative_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.negative(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.negative(dpt.asnumpy(X)) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_negative_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.negative(np.ones(U.shape, dtype=U.dtype)) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + for ord in ["C", "F", "A", "K"]: + Y = dpt.negative(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_nextafter.py b/dpnp/tests/tensor/elementwise/test_nextafter.py new file mode 100644 index 000000000000..b904bc42c6b7 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_nextafter.py @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _compare_dtypes, + _no_complex_dtypes, +) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_nextafter_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + r = dpt.nextafter(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.nextafter( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q) + + r = dpt.nextafter(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.nextafter( + np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) +def test_nextafter_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.nextafter(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.nextafter(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_nextafter_special_cases_nan(dt): + """If either x1_i or x2_i is NaN, the result is NaN.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([2.0, dpt.nan, dpt.nan], dtype=dt) + x2 = dpt.asarray([dpt.nan, 2.0, dpt.nan], dtype=dt) + + y = dpt.nextafter(x1, x2) + assert dpt.all(dpt.isnan(y)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_nextafter_special_cases_zero(dt): + """If x1_i is equal to x2_i, the result is x2_i.""" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x1 = dpt.asarray([-0.0, 0.0, -0.0, 0.0], dtype=dt) + x2 = dpt.asarray([0.0, -0.0, -0.0, 0.0], dtype=dt) + + y = dpt.nextafter(x1, x2) + assert dpt.all(y == 0) + + skip_checking_signs = ( + x1.dtype == dpt.float16 + and x1.sycl_device.backend == dpctl.backend_type.cuda + ) + if skip_checking_signs: + pytest.skip( + "Skipped checking signs for nextafter due to " + "known issue in DPC++ 
support for CUDA devices" + ) + else: + assert dpt.all(dpt.signbit(y) == dpt.signbit(x2)) + + +@pytest.mark.parametrize("dt", ["f2", "f4", "f8"]) +def test_nextafter_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + s = 10 + x1 = dpt.ones(s, dtype=dt, sycl_queue=q) + x2 = dpt.full(s, 2, dtype=dt, sycl_queue=q) + + r = dpt.nextafter(x1, x2) + expected_diff = dpt.asarray(dpt.finfo(dt).eps, dtype=dt, sycl_queue=q) + + assert dpt.all(r > 0) + assert dpt.all(r - x1 == expected_diff) + + x3 = dpt.zeros(s, dtype=dt, sycl_queue=q) + + r = dpt.nextafter(x3, x1) + assert dpt.all(r > 0) + + r = dpt.nextafter(x1, x3) + assert dpt.all((r - x1) < 0) + + r = dpt.nextafter(x1, 0) + assert dpt.all(x1 - r == (expected_diff) / 2) + + r = dpt.nextafter(x3, dpt.inf) + assert dpt.all(r > 0) + + r = dpt.nextafter(x3, -dpt.inf) + assert dpt.all(r < 0) diff --git a/dpnp/tests/tensor/elementwise/test_not_equal.py b/dpnp/tests/tensor/elementwise/test_not_equal.py new file mode 100644 index 000000000000..3f0eb58cf8b7 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_not_equal.py @@ -0,0 +1,225 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_not_equal_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.not_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.not_equal( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.not_equal(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected_dtype = np.not_equal( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ).dtype + assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_not_equal_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.not_equal(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_not_equal_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.not_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.not_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.not_equal(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.not_equal(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.not_equal(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.not_equal(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.not_equal(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_not_equal_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(5, dtype="i4") + + r = dpt.not_equal(m, v) + expected = np.full((100, 5), [True, False, True, True, True], dtype="?") + + assert (dpt.asnumpy(r) == 
expected).all() + + r2 = dpt.not_equal(v, m) + assert (dpt.asnumpy(r2) == expected).all() + + r3 = dpt.empty_like(m, dtype="?") + dpt.not_equal(m, v, out=r3) + assert (dpt.asnumpy(r3) == expected).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_not_equal_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.not_equal(X, sc) + assert isinstance(R, dpt.usm_ndarray) + assert not dpt.any(R) + R = dpt.not_equal(sc, X) + assert isinstance(R, dpt.usm_ndarray) + assert not dpt.any(R) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_not_equal_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.not_equal(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_not_equal_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.not_equal(a, c) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_not_equal_alignment(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 256 + s = dpt.concat((dpt.zeros(n, dtype=dtype), dpt.ones(n, dtype=dtype))) + + mask = s[:-1] != s[1:] + (pos,) = dpt.nonzero(mask) + assert dpt.all(pos == n - 1) + + out_arr = dpt.zeros(2 * n, dtype=mask.dtype) + dpt.not_equal(s[:-1], s[1:], out=out_arr[1:]) + (pos,) = dpt.nonzero(out_arr) + assert dpt.all(pos == n) diff --git a/dpnp/tests/tensor/elementwise/test_positive.py b/dpnp/tests/tensor/elementwise/test_positive.py new file mode 100644 index 000000000000..d4358e5827da --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_positive.py @@ -0,0 +1,94 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import _all_dtypes, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_positive_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q) + assert dpt.positive(X).dtype == arg_dt + + r = dpt.empty_like(X, dtype=arg_dt) + dpt.positive(X, out=r) + assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.positive(X))) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_positive_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("i4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + Y = dpt.positive(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = dpt.asnumpy(X) + assert np.allclose(dpt.asnumpy(Y), expected_Y) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_positive_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 + X[..., 1::2] = 0 + + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + expected_Y = np.ones(U.shape, dtype=U.dtype) + expected_Y[..., 1::2] = 0 + expected_Y = np.transpose(expected_Y, perms) + for ord in ["C", "F", "A", "K"]: + Y = dpt.positive(U, order=ord) + assert np.allclose(dpt.asnumpy(Y), expected_Y) diff --git a/dpnp/tests/tensor/elementwise/test_pow.py b/dpnp/tests/tensor/elementwise/test_pow.py new file mode 100644 index 000000000000..c68e6ad13b0a --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_pow.py @@ -0,0 +1,229 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import ctypes + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _can_cast + +from ..helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) +from .utils import ( + _all_dtypes, + _compare_dtypes, + _usm_types, +) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:]) +def test_power_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.pow(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.power( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.pow(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.power( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_power_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.pow(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type)) + assert r.usm_type == expected_usm_type + + +def test_pow_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.pow(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.pow(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.pow(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.pow(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.pow(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.pow(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.pow(ar1, ar2, 
order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.pow(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_pow_broadcasting(): + get_queue_or_skip() + + v = dpt.arange(1, 6, dtype="i4") + m = dpt.full((100, 5), 2, dtype="i4") + + r = dpt.pow(m, v) + + expected = np.power( + np.full((100, 5), 2, dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.pow(v, m) + expected2 = np.power( + np.arange(1, 6, dtype="i4"), np.full((100, 5), 2, dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_pow_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + complex(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.pow(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.pow(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_pow_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X **= int(1) + elif dt_kind == "f": + X **= float(1) + elif dt_kind == "c": + X **= complex(1) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:]) +def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"): + ar1 **= ar2 + assert ( + dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype) + ).all() + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + ar3[::-1] **= ar4[::2] + assert ( + dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype) + ).all() + + else: + with pytest.raises(ValueError): + ar1 **= ar2 + + +def test_pow_inplace_basic(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + expected = dpt.square(x) + x **= 2 + + assert dpt.all(x == expected) diff --git a/dpnp/tests/tensor/elementwise/test_reciprocal.py b/dpnp/tests/tensor/elementwise/test_reciprocal.py new file mode 100644 index 000000000000..dd31c3323f68 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/test_reciprocal.py @@ -0,0 +1,108 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _complex_fp_dtypes
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    one = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = dpt.divide(one, x).dtype
+    assert dpt.reciprocal(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.reciprocal(x)
+    expected = 1 / x
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_reciprocal_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.reciprocal(x)
+    expected = 1 / x
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+def test_reciprocal_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.reciprocal(x)
+    expected = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf, 0.0, -0.0], dtype="f4")
+    assert dpt.allclose(res, expected, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_reciprocal_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.reciprocal(z)
+
+    expected = 1 / z
+
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_remainder.py b/dpnp/tests/tensor/elementwise/test_remainder.py
new file mode 100644
index 000000000000..b8d5ca1cf8ae
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_remainder.py
@@ -0,0 +1,277 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _compare_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
+def test_remainder_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.remainder(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.remainder(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.remainder(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.remainder(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_remainder_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.remainder(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
+    assert r.usm_type == expected_usm_type
+
+
+def test_remainder_order():
+    get_queue_or_skip()
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
+    r1 = dpt.remainder(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.remainder(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.remainder(ar1, ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
+    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
+    r1 = dpt.remainder(ar1, ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.remainder(ar1, ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.remainder(ar1, ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.strides == (20, -1)
+
+    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
+    r4 = dpt.remainder(ar1, ar2, order="K")
+    assert r4.strides == (-1, 20)
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes[1:8:2])
+def test_remainder_negative_integers(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(-5, -1, 1, dtype=dt, sycl_queue=q)
+    x_np = np.arange(-5, -1, 1, dtype=dt)
+    val = 3
+
+    r1 = dpt.remainder(x, val)
+    expected = np.remainder(x_np, val)
+    assert (dpt.asnumpy(r1) == expected).all()
+
+    r2 = dpt.remainder(val, x)
+    expected = np.remainder(val, x_np)
+    assert (dpt.asnumpy(r2) == expected).all()
+
+
+def test_remainder_integer_zero():
+    get_queue_or_skip()
+
+    for dt in ["i4", "u4"]:
+        x = dpt.ones(1, dtype=dt)
+        y = dpt.zeros_like(x)
+
+        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
+
+        x = dpt.astype(x, dt)
+        y = dpt.zeros_like(x)
+
+        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes[9:])
+def test_remainder_negative_floats(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.linspace(-5, 5, 20, dtype=dt, sycl_queue=q)
+    x_np = np.linspace(-5, 5, 20, dtype=dt)
+    val = 3
+
+    tol = 8 * dpt.finfo(dt).resolution
+
+    r1 = dpt.remainder(x, val)
+    expected = np.remainder(x_np, val)
+    with np.errstate(invalid="ignore"):
+        assert np.allclose(
+            dpt.asnumpy(r1), expected, rtol=tol, atol=tol, equal_nan=True
+        )
+
+    r2 = dpt.remainder(val, x)
+    expected = np.remainder(val, x_np)
+    with np.errstate(invalid="ignore"):
+        assert np.allclose(
+            dpt.asnumpy(r2), expected, rtol=tol, atol=tol, equal_nan=True
+        )
+
+
+def test_remainder_special_cases():
+    get_queue_or_skip()
+
+    lhs = [dpt.nan, dpt.inf, 0.0, -0.0, -0.0, 1.0, dpt.inf, -dpt.inf]
+    rhs = [dpt.nan, dpt.inf, -0.0, 1.0, 1.0, 0.0, 1.0, -1.0]
+
+    x, y = dpt.asarray(lhs, dtype="f4"), dpt.asarray(rhs, dtype="f4")
+
+    x_np, y_np = np.asarray(lhs, dtype="f4"), np.asarray(rhs, dtype="f4")
+
+    res = dpt.remainder(x, y)
+
+    with np.errstate(invalid="ignore"):
+        assert np.allclose(dpt.asnumpy(res), np.remainder(x_np, y_np), equal_nan=True)
+
+
+@pytest.mark.parametrize("arr_dt", _no_complex_dtypes)
+def test_remainder_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.remainder(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.remainder(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X %= int(1)
+    elif dt_kind == "f":
+        X %= float(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 %= ar2
+        assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype))
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] %= ar4[::2]
+        assert dpt.all(ar3 == dpt.zeros(ar3.shape, dtype=ar3.dtype))
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 %= ar2
+
+
+def test_remainder_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = x & 1
+    x %= 2
+
+    assert dpt.all(x == expected)
diff --git a/dpnp/tests/tensor/elementwise/test_round.py b/dpnp/tests/tensor/elementwise/test_round.py
new file mode 100644
index 000000000000..5cfcb6dd598e
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_round.py
@@ -0,0 +1,234 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_array_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_round_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.round(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.round(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype)
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.round(X)
+    Ynp = np.round(Xnp)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.round(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    low = -88.0
+    high = 88.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt.round(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt.round(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_round_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 16.2
+    X[..., 1::2] = 23.7
+
+    Y = dpt.round(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.round(np.float32(16.2))
+    expected_Y[..., 1::2] = np.round(np.float32(23.7))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_round_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 8.8
+    X[..., 1::2] = 11.3
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.round(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.round(U, order=ord)
+            assert_allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
+    Xnp = np.array(x, dtype=dtype)
+    X = dpt.asarray(x, dtype=dtype)
+
+    Y = dpt.asnumpy(dpt.round(X))
+    Ynp = np.round(Xnp)
+    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
+    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_round_real_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
+        Xnp = Xnp.astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.round(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.round(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -88.0
+    high = 88.0
+    for ii in sizes:
+        x1 = np.random.uniform(low=low, high=high, size=ii)
+        x2 = np.random.uniform(low=low, high=high, size=ii)
+        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np.round(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt.round(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_round_complex_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
+    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
+
+    Xc_np = np.array(xc, dtype=dtype)
+    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
+
+    Ynp = np.round(Xc_np)
+    Y = dpt.round(Xc)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_rsqrt.py b/dpnp/tests/tensor/elementwise/test_rsqrt.py
new file mode 100644
index 000000000000..559de121e9be
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_rsqrt.py
@@ -0,0 +1,93 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _map_to_device_dtype,
+    _no_complex_dtypes,
+    _real_fp_dtypes,
+)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_rsqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.rsqrt(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+def test_rsqrt_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.rsqrt(x)
+    expected = dpt.asarray(
+        [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4"
+    )
+    assert dpt.allclose(res, expected, equal_nan=True)
diff --git a/dpnp/tests/tensor/elementwise/test_sign.py b/dpnp/tests/tensor/elementwise/test_sign.py
new file mode 100644
index 000000000000..e2addb23b711
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_sign.py
@@ -0,0 +1,140 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _no_complex_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_sign_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
+    assert dpt.sign(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.sign(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.sign(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_sign_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.sign(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_sign_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    expected_dt = np.sign(np.ones(tuple(), dtype=arg_dt)).dtype
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.ones(U.shape, dtype=expected_dt)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.sign(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_sign_complex(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    Xnp = np.random.standard_normal(
+        size=input_shape
+    ) + 1j * np.random.standard_normal(size=input_shape)
+    Xnp = Xnp.astype(arg_dt)
+    X[...] = Xnp
+
+    for ord in ["C", "F", "A", "K"]:
+        for perms in itertools.permutations(range(4)):
+            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+            Y = dpt.sign(U, order=ord)
+            X_t = np.transpose(Xnp[:, ::-1, ::-1, :], perms)
+            expected_Y = X_t / np.abs(X_t)
+            tol = dpt.finfo(Y.dtype).resolution
+            np.testing.assert_allclose(
+                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
+            )
+
+
+# test for all signed real data types
+@pytest.mark.parametrize(
+    "dt", _no_complex_dtypes[1:8:2] + _no_complex_dtypes[9:]
+)
+def test_sign_negative(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(-20, 20, 1, dtype=dt, sycl_queue=q)
+    x_np = np.arange(-20, 20, 1, dtype=dt)
+    res = dpt.sign(x)
+
+    assert (dpt.asnumpy(res) == np.sign(x_np)).all()
diff --git a/dpnp/tests/tensor/elementwise/test_signbit.py b/dpnp/tests/tensor/elementwise/test_signbit.py
new file mode 100644
index 000000000000..9006bcafbd2d
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_signbit.py
@@ -0,0 +1,124 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_out_type_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
+    sb = dpt.signbit(x)
+    assert sb.dtype == dpt.bool
+
+    assert not dpt.any(sb)
+
+    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
+    sb2 = dpt.signbit(x2)
+    assert dpt.all(sb2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_out_type_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
+    sb = dpt.signbit(x[::-3])
+    assert sb.dtype == dpt.bool
+
+    assert not dpt.any(sb)
+
+    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
+    sb2 = dpt.signbit(x2[::-3])
+    assert dpt.all(sb2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_special_cases_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    n = 63
+    x1 = dpt.full(n, -dpt.inf, dtype=arg_dt)
+    x2 = dpt.full(n, -0.0, dtype=arg_dt)
+    x3 = dpt.full(n, 0.0, dtype=arg_dt)
+    x4 = dpt.full(n, dpt.inf, dtype=arg_dt)
+
+    x = dpt.concat((x1, x2, x3, x4))
+    actual = dpt.signbit(x)
+
+    expected = dpt.concat(
+        (
+            dpt.full(x1.size, True),
+            dpt.full(x2.size, True),
+            dpt.full(x3.size, False),
+            dpt.full(x4.size, False),
+        )
+    )
+
+    assert dpt.all(dpt.equal(actual, expected))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_signbit_special_cases_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    x1 = dpt.full(63, -dpt.inf, dtype=arg_dt)
+    x2 = dpt.full(63, -0.0, dtype=arg_dt)
+    x3 = dpt.full(63, 0.0, dtype=arg_dt)
+    x4 = dpt.full(63, dpt.inf, dtype=arg_dt)
+
+    x = dpt.concat((x1, x2, x3, x4))
+    actual = dpt.signbit(x[::-1])
+
+    expected = dpt.concat(
+        (
+            dpt.full(x4.size, False),
+            dpt.full(x3.size, False),
+            dpt.full(x2.size, True),
+            dpt.full(x1.size, True),
+        )
+    )
+
+    assert dpt.all(dpt.equal(actual, expected))
diff --git a/dpnp/tests/tensor/elementwise/test_sqrt.py b/dpnp/tests/tensor/elementwise/test_sqrt.py
new file mode 100644
index 000000000000..d6bc7a42434e
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_sqrt.py
@@ -0,0 +1,207 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+import warnings
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose, assert_equal
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _complex_fp_dtypes,
+    _map_to_device_dtype,
+    _real_fp_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_sqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.sqrt(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.sqrt(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.sqrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.sqrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_sqrt_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 16.0
+    X[..., 1::2] = 23.0
+
+    Y = dpt.sqrt(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.sqrt(np.float32(16.0))
+    expected_Y[..., 1::2] = np.sqrt(np.float32(23.0))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_sqrt_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 16.0
+    X[..., 1::2] = 23.0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.sqrt(dpt.asnumpy(U))
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.sqrt(U, order=ord)
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_sqrt_special_cases():
+    q = get_queue_or_skip()
+
+    X = dpt.asarray(
+        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
+    )
+    Xnp = dpt.asnumpy(X)
+
+    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_sqrt_real_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+
+    x = dpt.asarray(inps_, dtype=dtype)
+    r = dpt.sqrt(x)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(inps_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
+def test_sqrt_complex_fp_special_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    nans_ = [dpt.nan, -dpt.nan]
+    infs_ = [dpt.inf, -dpt.inf]
+    finites_ = [-1.0, -0.0, 0.0, 1.0]
+    inps_ = nans_ + infs_ + finites_
+    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
+
+    z = dpt.asarray(c_, dtype=dtype)
+    r = dpt.sqrt(z)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        expected_np = np.sqrt(np.asarray(c_, dtype=dtype))
+
+    expected = dpt.asarray(expected_np, dtype=dtype)
+    tol = dpt.finfo(r.dtype).resolution
+
+    if not dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True):
+        for i in range(r.shape[0]):
+            # some devices deviate from NumPy on complex special values;
+            # report the first mismatching element and skip the test
+            if not dpt.allclose(
+                r[i], expected[i], atol=tol, rtol=tol, equal_nan=True
+            ):
+                msg = (
+                    f"Test failed for input {z[i]}, i.e. {c_[i]} for index {i}"
+                )
+                msg += f", results were {r[i]} vs. {expected[i]}"
+                pytest.skip(reason=msg)
diff --git a/dpnp/tests/tensor/elementwise/test_square.py b/dpnp/tests/tensor/elementwise/test_square.py
new file mode 100644
index 000000000000..0b65e9af53ce
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_square.py
@@ -0,0 +1,114 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import _all_dtypes, _usm_types
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_square_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    X = dpt.arange(5, dtype=arg_dt, sycl_queue=q)
+    assert dpt.square(X).dtype == arg_dt
+
+    r = dpt.empty_like(X, dtype=arg_dt)
+    dpt.square(X, out=r)
+    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.square(X)))
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_square_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("i4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1
+    X[..., 1::2] = 0
+
+    Y = dpt.square(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = dpt.asnumpy(X)
+    assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_square_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 2
+    X[..., 1::2] = 0
+
+    for perms in itertools.permutations(range(4)):
+        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+        expected_Y = np.full(U.shape, 4, dtype=U.dtype)
+        expected_Y[..., 1::2] = 0
+        expected_Y = np.transpose(expected_Y, perms)
+        for ord in ["C", "F", "A", "K"]:
+            Y = dpt.square(U, order=ord)
+            assert np.allclose(dpt.asnumpy(Y), expected_Y)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_square_special_cases(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    vals = [np.nan, np.inf, -np.inf, 0.0, -0.0]
+    X = dpt.asarray(vals, dtype=dtype, sycl_queue=q)
+    X_np = dpt.asnumpy(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    with np.errstate(all="ignore"):
+        assert np.allclose(
+            dpt.asnumpy(dpt.square(X)),
+            np.square(X_np),
+            atol=tol,
+            rtol=tol,
+            equal_nan=True,
+        )
diff --git a/dpnp/tests/tensor/elementwise/test_subtract.py b/dpnp/tests/tensor/elementwise/test_subtract.py
new file mode 100644
index 000000000000..70d05f926c23
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_subtract.py
@@ -0,0 +1,252 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._type_utils import _can_cast
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _compare_dtypes,
+    _usm_types,
+)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_subtract_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.subtract(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.subtract(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.subtract(ar1, ar2, out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.subtract(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_dtype = np.subtract(
+        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
+    ).dtype
+    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
+
+    r2 = dpt.empty_like(ar1, dtype=r.dtype)
+    dpt.subtract(ar3[::-1], ar4[::2], out=r2)
+    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
+
+
+def test_subtract_bool():
+    get_queue_or_skip()
+    ar1 = dpt.ones(127, dtype="?")
+    ar2 = dpt.ones_like(ar1, dtype="?")
+    with pytest.raises(ValueError):
+        dpt.subtract(ar1, ar2)
+
+
+@pytest.mark.parametrize("op1_usm_type", _usm_types)
+@pytest.mark.parametrize("op2_usm_type", _usm_types)
+def test_subtract_usm_type_matrix(op1_usm_type, op2_usm_type):
+    get_queue_or_skip()
+
+    sz = 128
+    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
+    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
+
+    r = dpt.subtract(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected_usm_type = dpt.get_coerced_usm_type((op1_usm_type, op2_usm_type))
+    assert r.usm_type == expected_usm_type
+
+
+def test_subtract_order():
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
+        r1 = dpt.subtract(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.subtract(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.subtract(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
+        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
+        r1 = dpt.subtract(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.subtract(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.subtract(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.subtract(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+        r4 = dpt.subtract(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.subtract(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_subtract_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    r = dpt.subtract(m, v)
+    assert (
+        dpt.asnumpy(r) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
+    ).all()
+
+    r2 = dpt.subtract(v, m)
+    assert (
+        dpt.asnumpy(r2) == np.arange(-1, 4, dtype="i4")[np.newaxis, :]
+    ).all()
+
+
+@pytest.mark.parametrize("arr_dt", _all_dtypes[1:])
+def test_subtract_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        R = dpt.subtract(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.subtract(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_subtract_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X -= int(0)
+    elif dt_kind == "f":
+        X -= float(0)
+    elif dt_kind == "c":
+        X -= complex(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_subtract_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
+        ar1 -= ar2
+        assert (dpt.asnumpy(ar1) == np.zeros(ar1.shape, dtype=ar1.dtype)).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] -= ar4[::2]
+        assert (dpt.asnumpy(ar3) == np.zeros(ar3.shape, dtype=ar3.dtype)).all()
+
+    else:
+        with pytest.raises(ValueError):
+            ar1 -= ar2
+
+
+def test_subtract_inplace_broadcasting():
+    get_queue_or_skip()
+
+    m = dpt.ones((100, 5), dtype="i4")
+    v = dpt.arange(5, dtype="i4")
+
+    m -= v
+    assert (
+        dpt.asnumpy(m) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
+    ).all()
diff --git a/dpnp/tests/tensor/elementwise/test_trigonometric.py b/dpnp/tests/tensor/elementwise/test_trigonometric.py
new file mode 100644
index 000000000000..497432360306
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_trigonometric.py
@@ -0,0 +1,234 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpnp.tensor as dpt
+
+from ..helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+_trig_funcs = [(np.sin, dpt.sin), (np.cos, dpt.cos), (np.tan, dpt.tan)]
+_inv_trig_funcs = [
+    (np.arcsin, dpt.asin),
+    (np.arccos, dpt.acos),
+    (np.arctan, dpt.atan),
+]
+_all_funcs = _trig_funcs + _inv_trig_funcs
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_trig_out_type(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np_call(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt_call(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 100
+    n_rep = 137
+    # pick a sampling domain appropriate to each family of functions
+    if np_call in _trig_funcs:
+        Xnp = np.linspace(
+            -np.pi / 2 * 0.99, np.pi / 2 * 0.99, num=n_seq, dtype=dtype
+        )
+    elif np_call == np.arctan:
+        Xnp = np.linspace(-100.0, 100.0, num=n_seq, dtype=dtype)
+    else:
+        Xnp = np.linspace(-1.0, 1.0, num=n_seq, dtype=dtype)
+
+    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
+    Y = dpt_call(X)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    assert_allclose(
+        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(
+        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
+    )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_trig_complex_contig(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 256
+    n_rep = 137
+    low = -9.0
+    high = 9.0
+    x1 = np.random.uniform(low=low, high=high, size=n_seq)
+    x2 = np.random.uniform(low=low, high=high, size=n_seq)
+    Xnp = x1 + 1j * x2
+
+    # stay away from poles and branch lines
+    modulus = np.abs(Xnp)
+    sel = np.logical_or(
+        modulus < 0.9,
+        np.logical_and(
+            modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
+        ),
+    )
+    Xnp = Xnp[sel]
+
+    X = dpt.repeat(dpt.asarray(Xnp, dtype=dtype, sycl_queue=q), n_rep)
+    Y = dpt_call(X)
+
+    expected = np.repeat(np_call(Xnp.astype(dtype)), n_rep)
+
+    tol = 50 * dpt.finfo(dtype).resolution
+    assert_allclose(dpt.asnumpy(Y), expected, atol=tol, rtol=tol)
+
+    Z = dpt.empty_like(X, dtype=dtype)
+    dpt_call(X, out=Z)
+
+    assert_allclose(dpt.asnumpy(Z), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 3, 4, 6, 8, 9, 24, 50, 72]
+    tol = 8 * dpt.finfo(dtype).resolution
+
+    low = -100.0
+    high = 100.0
+    if np_call in [np.arccos, np.arcsin]:
+        low = -1.0
+        high = 1.0
+    elif np_call in [np.tan]:
+        low = -np.pi / 2 * (0.99)
+        high = np.pi / 2 * (0.99)
+
+    for ii in sizes:
+        Xnp = np.random.uniform(low=low, high=high, size=ii).astype(dtype)
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_trig_complex_strided(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    np.random.seed(42)
+    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
+    sizes = [2, 4, 6, 8, 9, 24, 72]
+    tol = 50 * dpt.finfo(dtype).resolution
+
+    low = -9.0
+    high = 9.0
+    while True:
+        x1 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
+        x2 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
+        Xnp_all = np.array(
+            [complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype
+        )
+
+        # stay away from poles and branch lines
+        modulus = np.abs(Xnp_all)
+        sel = np.logical_or(
+            modulus < 0.9,
+            np.logical_and(
+                modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
+            ),
+        )
+        Xnp_all = Xnp_all[sel]
+        if Xnp_all.size > sum(sizes):
+            break
+
+    pos = 0
+    for ii in sizes:
+        pos = pos + ii
+        Xnp = Xnp_all[:pos]
+        Xnp = Xnp[-ii:]
+        X = dpt.asarray(Xnp)
+        Ynp = np_call(Xnp)
+        for jj in strides:
+            assert_allclose(
+                dpt.asnumpy(dpt_call(X[::jj])),
+                Ynp[::jj],
+                atol=tol,
+                rtol=tol,
+            )
+
+
+@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_trig_real_special_cases(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0]
+
+    xf = np.array(x, dtype=dtype)
+    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
+
+    with np.errstate(all="ignore"):
+        Y_np = np_call(xf)
+
+    tol = 8 * dpt.finfo(dtype).resolution
+    Y = dpt_call(yf)
+    assert_allclose(dpt.asnumpy(Y), Y_np, atol=tol, rtol=tol)
diff --git a/dpnp/tests/tensor/elementwise/test_type_utils.py b/dpnp/tests/tensor/elementwise/test_type_utils.py
new file mode 100644
index 000000000000..42e096f4f42d
--- /dev/null
+++ b/dpnp/tests/tensor/elementwise/test_type_utils.py
@@ -0,0 +1,254 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+import dpnp.tensor._type_utils as tu
+
+from .utils import (
+    _all_dtypes,
+    _map_to_device_dtype,
+)
+
+
+class MockDevice:
+    def __init__(self, fp16: bool, fp64: bool):
+        self.has_aspect_fp16 = fp16
+        self.has_aspect_fp64 = fp64
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_type_utils_map_to_device_type(dtype):
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            dt_in = dpt.dtype(dtype)
+            dt_out = _map_to_device_dtype(dt_in, dev)
+            assert isinstance(dt_out, dpt.dtype)
+
+
+def test_type_util_all_data_types():
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            r = tu._all_data_types(fp16, fp64)
+            assert isinstance(r, list)
+            # 11: bool + 4 signed + 4 unsigned integral + float32 + complex64
+            assert len(r) == 11 + int(fp16) + 2 * int(fp64)
+
+
+def test_type_util_can_cast():
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            for from_ in _all_dtypes:
+                for to_ in _all_dtypes:
+                    r = tu._can_cast(
+                        dpt.dtype(from_), dpt.dtype(to_), fp16, fp64
+                    )
+                    assert isinstance(r, bool)
+
+
+def test_type_utils_find_buf_dtype():
+    def _denier_fn(dt):
+        return False
+
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            arg_dt = dpt.float64
+            r = tu._find_buf_dtype(
+                arg_dt, _denier_fn, dev, tu._acceptance_fn_default_unary
+            )
+            assert r == (
+                None,
+                None,
+            )
+
+
+def test_type_utils_get_device_default_type():
+    with pytest.raises(RuntimeError):
+        tu._get_device_default_dtype("-", MockDevice(True, True))
+    try:
+        dev = dpctl.SyclDevice()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    for k in ["b", "i", "u", "f", "c"]:
+        dt = tu._get_device_default_dtype(k, dev)
+        assert isinstance(dt, dpt.dtype)
+        assert dt.kind == k
+
+
+def test_type_utils_find_buf_dtype2():
+    def _denier_fn(dt1, dt2):
+        return False
+
+    for fp64 in [
+        True,
+        False,
+    ]:
+        for fp16 in [True, False]:
+            dev = MockDevice(fp16, fp64)
+            arg1_dt = dpt.float64
+            arg2_dt = dpt.complex64
+            r = tu._find_buf_dtype2(
+                arg1_dt,
+                arg2_dt,
+                _denier_fn,
+                dev,
+                tu._acceptance_fn_default_binary,
+            )
+            assert r == (
+                None,
+                None,
+                None,
+            )
+
+
+def test_unary_func_arg_validation():
+    with pytest.raises(TypeError):
+        dpt.abs([1, 2, 3])
+    try:
+        a = dpt.arange(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    # an unrecognized order value is not an error; it falls back to the default
+    dpt.abs(a, order="invalid")
+
+
+def test_binary_func_arg_validation():
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.add([1, 2, 3], 1)
+    try:
+        a = dpt.arange(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(ValueError):
+        dpt.add(a, Ellipsis)
+    # an unrecognized order value is not an error; it falls back to the default
+    dpt.add(a, a, order="invalid")
+
+
+def test_all_data_types():
+    fp16_fp64_types = {dpt.float16, dpt.float64,
dpt.complex128} + fp64_types = {dpt.float64, dpt.complex128} + + all_dts = tu._all_data_types(True, True) + assert fp16_fp64_types.issubset(all_dts) + + all_dts = tu._all_data_types(True, False) + assert dpt.float16 in all_dts + assert not fp64_types.issubset(all_dts) + + all_dts = tu._all_data_types(False, True) + assert dpt.float16 not in all_dts + assert fp64_types.issubset(all_dts) + + all_dts = tu._all_data_types(False, False) + assert not fp16_fp64_types.issubset(all_dts) + + +@pytest.mark.parametrize("fp16", [True, False]) +@pytest.mark.parametrize("fp64", [True, False]) +def test_maximal_inexact_types(fp16, fp64): + assert not tu._is_maximal_inexact_type(dpt.int32, fp16, fp64) + assert fp64 == tu._is_maximal_inexact_type(dpt.float64, fp16, fp64) + assert fp64 == tu._is_maximal_inexact_type(dpt.complex128, fp16, fp64) + assert fp64 != tu._is_maximal_inexact_type(dpt.float32, fp16, fp64) + assert fp64 != tu._is_maximal_inexact_type(dpt.complex64, fp16, fp64) + + +def test_can_cast_device(): + assert tu._can_cast(dpt.int64, dpt.float64, True, True) + # if f8 is available, can't cast i8 to f4 + assert not tu._can_cast(dpt.int64, dpt.float32, True, True) + assert not tu._can_cast(dpt.int64, dpt.float32, False, True) + # should be able to cast to f8 when f2 unavailable + assert tu._can_cast(dpt.int64, dpt.float64, False, True) + # casting to f4 acceptable when f8 unavailable + assert tu._can_cast(dpt.int64, dpt.float32, True, False) + assert tu._can_cast(dpt.int64, dpt.float32, False, False) + # can't safely cast inexact type to inexact type of lesser precision + assert not tu._can_cast(dpt.float32, dpt.float16, True, False) + assert not tu._can_cast(dpt.float64, dpt.float32, False, True) + + +def test_acceptance_fns(): + """Check type promotion acceptance functions""" + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device is not available") + assert tu._acceptance_fn_reciprocal( + dpt.float32, dpt.float32, dpt.float32, dev + ) + assert tu._acceptance_fn_negative(dpt.int8, dpt.int16, dpt.int16, dev) + + +def test_weak_types(): + wbt = tu.WeakBooleanType(True) + assert wbt.get() + assert tu._weak_type_num_kind(wbt) == 0 + + wit = tu.WeakIntegralType(7) + assert wit.get() == 7 + assert tu._weak_type_num_kind(wit) == 1 + + wft = tu.WeakFloatingType(3.1415926) + assert wft.get() == 3.1415926 + assert tu._weak_type_num_kind(wft) == 2 + + wct = tu.WeakComplexType(2.0 + 3.0j) + assert wct.get() == 2 + 3j + assert tu._weak_type_num_kind(wct) == 3 + + +def test_arg_validation(): + with pytest.raises(TypeError): + tu._weak_type_num_kind(dict()) + + with pytest.raises(TypeError): + tu._strong_dtype_num_kind(Ellipsis) + + with pytest.raises(ValueError): + tu._strong_dtype_num_kind(np.dtype("O")) + + wt = tu.WeakFloatingType(2.0) + with pytest.raises(ValueError): + tu._resolve_weak_types(wt, wt, None) diff --git a/dpnp/tests/tensor/elementwise/utils.py b/dpnp/tests/tensor/elementwise/utils.py new file mode 100644 index 000000000000..6717ea577bd3 --- /dev/null +++ b/dpnp/tests/tensor/elementwise/utils.py @@ -0,0 +1,74 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl + +import dpnp.tensor._type_utils as tu + +_integral_dtypes = [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", +] +_real_fp_dtypes = ["f2", "f4", "f8"] +_complex_fp_dtypes = [ + "c8", + "c16", +] +_real_value_dtypes = _integral_dtypes + _real_fp_dtypes +_no_complex_dtypes = [ + "b1", +] + _real_value_dtypes +_all_dtypes = _no_complex_dtypes + _complex_fp_dtypes + +_usm_types = ["device", "shared", "host"] + + +def _map_to_device_dtype(dt, dev): + return tu._to_device_supported_dtype(dt, dev) + + +def _compare_dtypes(dt, ref_dt, sycl_queue=None): + assert isinstance(sycl_queue, dpctl.SyclQueue) + dev = sycl_queue.sycl_device + expected_dt = _map_to_device_dtype(ref_dt, dev) + return dt == expected_dt + + +__all__ = [ + "_no_complex_dtypes", + "_all_dtypes", + "_usm_types", + "_map_to_device_dtype", + "_compare_dtypes", +] diff --git a/dpnp/tests/tensor/helper/__init__.py b/dpnp/tests/tensor/helper/__init__.py new file mode 100644 index 000000000000..7fdb1fbe553b --- /dev/null +++ b/dpnp/tests/tensor/helper/__init__.py @@ -0,0 +1,47 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +"""Helper module for tensor tests""" + +from ._helper import ( + create_invalid_capsule, + get_queue_or_skip, + has_cpu, + has_gpu, + has_sycl_platforms, + skip_if_dtype_not_supported, +) + +__all__ = [ + "create_invalid_capsule", + "has_cpu", + "has_gpu", + "has_sycl_platforms", + "get_queue_or_skip", + "skip_if_dtype_not_supported", +] diff --git a/dpnp/tests/tensor/helper/_helper.py b/dpnp/tests/tensor/helper/_helper.py new file mode 100644 index 000000000000..5d0b4825e953 --- /dev/null +++ b/dpnp/tests/tensor/helper/_helper.py @@ -0,0 +1,89 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import pytest + + +def has_gpu(backend="opencl"): + return bool(dpctl.get_num_devices(backend=backend, device_type="gpu")) + + +def has_cpu(backend="opencl"): + return bool(dpctl.get_num_devices(backend=backend, device_type="cpu")) + + +def has_sycl_platforms(): + return bool(len(dpctl.get_platforms())) + + +def create_invalid_capsule(): + """Creates an invalid capsule for the purpose of testing dpctl + constructors that accept capsules. 
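+
+    The capsule is given a deliberately unexpected name (b"invalid"), so a
+    consumer that checks the capsule name is expected to reject it.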
+ """ + import ctypes + + ctor = ctypes.pythonapi.PyCapsule_New + ctor.restype = ctypes.py_object + ctor.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] + return ctor(id(ctor), b"invalid", 0) + + +def get_queue_or_skip(args=()): + try: + q = dpctl.SyclQueue(*args) + except dpctl.SyclQueueCreationError: + pytest.skip(f"Queue could not be created from {args}") + return q + + +def skip_if_dtype_not_supported(dt, q_or_dev): + import dpnp.tensor as dpt + + dt = dpt.dtype(dt) + if type(q_or_dev) is dpctl.SyclQueue: + dev = q_or_dev.sycl_device + elif type(q_or_dev) is dpctl.SyclDevice: + dev = q_or_dev + else: + raise TypeError( + "Expected dpctl.SyclQueue or dpctl.SyclDevice, " + f"got {type(q_or_dev)}" + ) + dev_has_dp = dev.has_aspect_fp64 + if dev_has_dp is False and dt in [dpt.float64, dpt.complex128]: + pytest.skip( + f"{dev.name} does not support double precision floating point types" + ) + dev_has_hp = dev.has_aspect_fp16 + if dev_has_hp is False and dt in [ + dpt.float16, + ]: + pytest.skip( + f"{dev.name} does not support half precision floating point type" + ) diff --git a/dpnp/tests/tensor/test_tensor_accumulation.py b/dpnp/tests/tensor/test_tensor_accumulation.py new file mode 100644 index 000000000000..b7ea9147e100 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_accumulation.py @@ -0,0 +1,449 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from random import randrange + +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +sint_types = [ + dpt.int8, + dpt.int16, + dpt.int32, + dpt.int64, +] +uint_types = [ + dpt.uint8, + dpt.uint16, + dpt.uint32, + dpt.uint64, +] +rfp_types = [ + dpt.float16, + dpt.float32, + dpt.float64, +] +cfp_types = [ + dpt.complex64, + dpt.complex128, +] + +no_complex_types = [dpt.bool] + sint_types + uint_types + rfp_types + +all_types = [dpt.bool] + sint_types + uint_types + rfp_types + cfp_types + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n) + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_sint(dt): + get_queue_or_skip() + n = 10000 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n)[1::2] + + res = dpt.cumulative_sum(x, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + x2 = dpt.repeat(dpt.asarray([-1, 1], dtype=dt), 2 * n)[-1::-2] + + res = dpt.cumulative_sum(x2, dtype=dt) + + ar = dpt.arange(n, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == expected) + + +@pytest.mark.parametrize("dt", sint_types) +def test_contig_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1)) + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +@pytest.mark.parametrize("dt", sint_types) +def test_strided_cumsum_axis_sint(dt): + get_queue_or_skip() + n0, n1 = 1000, 173 + x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n0) + m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))[1::2, ::-1] + + res = dpt.cumulative_sum(m, dtype=dt, axis=0) + + ar = dpt.arange(n0, dtype=dt) + expected = dpt.concat((1 + ar, dpt.flip(ar))) + assert dpt.all(res == dpt.expand_dims(expected, axis=1)) + + +def test_accumulate_scalar(): + get_queue_or_skip() + + s = dpt.asarray(1, dtype="i8") + r = dpt.cumulative_sum(s) + assert r == s + assert r.ndim == s.ndim + + r = dpt.cumulative_sum(s, include_initial=True) + r_expected = dpt.asarray([0, 1], dtype="i8") + assert dpt.all(r == r_expected) + + +def test_cumulative_sum_include_initial(): + get_queue_or_skip() + + n0, n1 = 3, 5 + x = dpt.ones((n0, n1), dtype="i4") + r = dpt.cumulative_sum(x, axis=0, include_initial=True) + assert dpt.all(r[0, :] == 0) + + r = dpt.cumulative_sum(x, axis=1, include_initial=True) + assert dpt.all(r[:, 0] == 0) + + x = dpt.ones(n1, dtype="i4") + r = dpt.cumulative_sum(x, include_initial=True) + assert r.shape == (n1 + 1,) + assert r[0] == 0 + + +def test_cumulative_prod_identity(): + get_queue_or_skip() + + x = dpt.zeros(5, dtype="i4") + r = dpt.cumulative_prod(x, include_initial=True) + assert r[0] == 1 + + +def test_cumulative_logsumexp_identity(): + get_queue_or_skip() + + x = dpt.ones(5, dtype="f4") + r = dpt.cumulative_logsumexp(x, include_initial=True) + assert r[0] == -dpt.inf + + +def test_accumulate_zero_size_dims(): + get_queue_or_skip() + + 
n0, n1, n2 = 3, 0, 5
+    x = dpt.ones((n0, n1, n2), dtype="i8")
+    r = dpt.cumulative_sum(x, axis=1)
+    assert r.shape == x.shape
+    assert r.size == 0
+
+    r = dpt.cumulative_sum(x, axis=0)
+    assert r.shape == x.shape
+    assert r.size == 0
+
+    r = dpt.cumulative_sum(x, axis=1, include_initial=True)
+    assert r.shape == (n0, n1 + 1, n2)
+    assert r.size == (n0 * n2)
+
+    r = dpt.cumulative_sum(x, axis=0, include_initial=True)
+    assert r.shape == (n0 + 1, n1, n2)
+    assert r.size == 0
+
+
+@pytest.mark.parametrize("arg_dtype", all_types)
+def test_cumsum_arg_dtype_default_output_dtype_matrix(arg_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+
+    n = 100
+    x = dpt.ones(n, dtype=arg_dtype)
+    r = dpt.cumulative_sum(x)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    if x.dtype.kind == "i":
+        assert r.dtype.kind == "i"
+    elif x.dtype.kind == "u":
+        assert r.dtype.kind == "u"
+    elif x.dtype.kind in "fc":
+        assert r.dtype == arg_dtype
+
+    r_expected = dpt.arange(1, n + 1, dtype=r.dtype)
+
+    assert dpt.all(r == r_expected)
+
+
+@pytest.mark.parametrize("arg_dtype", all_types)
+@pytest.mark.parametrize("out_dtype", all_types)
+def test_cumsum_arg_out_dtype_matrix(arg_dtype, out_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arg_dtype, q)
+    skip_if_dtype_not_supported(out_dtype, q)
+
+    n = 100
+    x = dpt.ones(n, dtype=arg_dtype)
+    r = dpt.cumulative_sum(x, dtype=out_dtype)
+
+    assert isinstance(r, dpt.usm_ndarray)
+    assert r.dtype == dpt.dtype(out_dtype)
+    if out_dtype == dpt.bool:
+        assert dpt.all(r)
+    else:
+        r_expected = dpt.arange(1, n + 1, dtype=out_dtype)
+        assert dpt.all(r == r_expected)
+
+
+def test_accumulator_out_kwarg():
+    q = get_queue_or_skip()
+
+    n = 100
+
+    expected = dpt.arange(1, n + 1, dtype="i4", sycl_queue=q)
+    x = dpt.ones(n, dtype="i4", sycl_queue=q)
+    out = dpt.empty_like(x, dtype="i4")
+    dpt.cumulative_sum(x, dtype="i4", out=out)
+    assert dpt.all(expected == out)
+
+    # overlap
+    x = dpt.ones(n, dtype="i4", sycl_queue=q)
+    dpt.cumulative_sum(x, dtype="i4", out=x)
+    assert dpt.all(x == expected)
+
+    # axis before final axis
+    expected = dpt.broadcast_to(
+        dpt.arange(1, n + 1, dtype="i4", sycl_queue=q), (n, n)
+    ).mT
+    x = dpt.ones((n, n), dtype="i4", sycl_queue=q)
+    out = dpt.empty_like(x, dtype="i4")
+    dpt.cumulative_sum(x, axis=0, dtype="i4", out=out)
+    assert dpt.all(expected == out)
+
+    # scalar
+    x = dpt.asarray(3, dtype="i4")
+    out = dpt.empty((), dtype="i4")
+    expected = 3
+    dpt.cumulative_sum(x, dtype="i4", out=out)
+    assert expected == out
+
+
+def test_accumulator_arg_validation():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    n = 5
+    x1 = dpt.ones((n, n), dtype="f4", sycl_queue=q1)
+    x2 = dpt.ones(n, dtype="f4", sycl_queue=q1)
+
+    # must be usm_ndarray
+    with pytest.raises(TypeError):
+        dpt.cumulative_sum(dict())
+
+    # axis must be specified when input not 1D
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x1)
+
+    # out must be usm_ndarray
+    with pytest.raises(TypeError):
+        dpt.cumulative_sum(x2, out=dict())
+
+    # out must be writable
+    out_not_writable = dpt.empty_like(x2)
+    out_not_writable.flags.writable = False
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2, out=out_not_writable)
+
+    # out must be expected shape
+    out_wrong_shape = dpt.ones(n + 1, dtype=x2.dtype, sycl_queue=q1)
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2, out=out_wrong_shape)
+
+    # out must be expected dtype
+    out_wrong_dtype = dpt.empty_like(x2, dtype="i4")
+    with pytest.raises(ValueError):
+        dpt.cumulative_sum(x2,
out=out_wrong_dtype) + + # compute follows data + out_wrong_queue = dpt.empty_like(x2, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.cumulative_sum(x2, out=out_wrong_queue) + + +def test_cumsum_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_sum(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_cumprod_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_prod(x) + assert dpt.all(dpt.isnan(r[i:])) + + +def test_logcumsumexp_nan_propagation(): + get_queue_or_skip() + + n = 100 + x = dpt.ones(n, dtype="f4") + i = randrange(n) + x[i] = dpt.nan + + r = dpt.cumulative_logsumexp(x) + assert dpt.all(dpt.isnan(r[i:])) + + +@pytest.mark.parametrize("arg_dtype", no_complex_types) +def test_logcumsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones(10, dtype=arg_dtype, sycl_queue=q) + r = dpt.cumulative_logsumexp(x) + + if arg_dtype.kind in "biu": + assert r.dtype.kind == "f" + else: + assert r.dtype == arg_dtype + + +def test_logcumsumexp_complex_error(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="c8") + with pytest.raises(ValueError): + dpt.cumulative_logsumexp(x) + + +def test_cumprod_basic(): + get_queue_or_skip() + + n = 50 + val = 2 + x = dpt.full(n, val, dtype="i8") + r = dpt.cumulative_prod(x) + expected = dpt.pow(val, dpt.arange(1, n + 1, dtype="i8")) + + assert dpt.all(r == expected) + + x = dpt.tile(dpt.asarray([2, 0.5], dtype="f4"), 10000) + expected = dpt.tile(dpt.asarray([2, 1], dtype="f4"), 10000) + r = dpt.cumulative_prod(x) + assert dpt.all(r == expected) + + +def test_logcumsumexp_basic(): + get_queue_or_skip() + + dt = dpt.float32 + x = dpt.ones(1000, dtype=dt) + r = dpt.cumulative_logsumexp(x) + + expected = 1 + dpt.log(dpt.arange(1, 1001, dtype=dt)) + + tol = 4 * dpt.finfo(dt).resolution + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +def geometric_series_closed_form(n, dtype=None, device=None): + """Closed form for cumulative_logsumexp(dpt.arange(-n, 0)) + + :math:`r[k] == -n + k + log(1 - exp(-k-1)) - log(1-exp(-1))` + """ + x = dpt.arange(-n, 0, dtype=dtype, device=device) + y = dpt.arange(-1, -n - 1, step=-1, dtype=dtype, device=device) + y = dpt.exp(y, out=y) + y = dpt.negative(y, out=y) + y = dpt.log1p(y, out=y) + y -= y[0] + return x + y + + +@pytest.mark.parametrize("fpdt", rfp_types) +def test_cumulative_logsumexp_closed_form(fpdt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(fpdt, q) + + n = 128 + r = dpt.cumulative_logsumexp(dpt.arange(-n, 0, dtype=fpdt, device=q)) + expected = geometric_series_closed_form(n, dtype=fpdt, device=q) + + tol = 4 * dpt.finfo(fpdt).eps + assert dpt.allclose(r, expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("p", [257, 260, 273, 280, 509, 512]) +def test_cumulative_sum_gh_1901(p): + get_queue_or_skip() + + n = p * p + dt = dpt.int32 + inp = dpt.ones(n, dtype=dt) + r = dpt.cumulative_sum(inp, dtype=dt) + assert dpt.all(r == dpt.arange(1, n + 1, dtype=dt)) + + +@pytest.mark.parametrize( + "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"] +) +def test_gh_2017(dt): + "See https://github.com/IntelPython/dpctl/issues/2017" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q) + r = dpt.cumulative_sum(x, dtype="?") + assert 
dpt.all(r) diff --git a/dpnp/tests/tensor/test_tensor_array_api_inspection.py b/dpnp/tests/tensor/test_tensor_array_api_inspection.py new file mode 100644 index 000000000000..2eb198944656 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_array_api_inspection.py @@ -0,0 +1,238 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import pytest + +import dpnp.tensor as dpt +from dpnp.tensor._tensor_impl import ( + default_device_complex_type, + default_device_fp_type, + default_device_index_type, + default_device_int_type, +) + +_dtypes_no_fp16_fp64 = { + "bool": dpt.bool, + "float32": dpt.float32, + "complex64": dpt.complex64, + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + "uint8": dpt.uint8, + "uint16": dpt.uint16, + "uint32": dpt.uint32, + "uint64": dpt.uint64, +} + + +def test_array_api_inspection_methods(): + info = dpt.__array_namespace_info__() + assert info.capabilities() + try: + assert info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + assert info.default_dtypes() + assert info.devices() + assert info.dtypes() + + +def test_array_api_inspection_default_device(): + try: + dev = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + assert dpt.__array_namespace_info__().default_device() == dev + + +def test_array_api_inspection_devices(): + try: + devices2 = dpctl.get_devices() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + devices1 = dpt.__array_namespace_info__().devices() + assert len(devices1) == len(devices2) + assert devices1 == devices2 + + +def test_array_api_inspection_capabilities(): + capabilities = dpt.__array_namespace_info__().capabilities() + assert capabilities["boolean indexing"] + assert capabilities["data-dependent shapes"] + assert capabilities["max dimensions"] is None + + +def test_array_api_inspection_default_dtypes(): + try: + dev = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + int_dt = default_device_int_type(dev) + ind_dt = default_device_index_type(dev) + fp_dt = default_device_fp_type(dev) + cm_dt = default_device_complex_type(dev) + + info = dpt.__array_namespace_info__() + default_dts_nodev = info.default_dtypes() + default_dts_dev = info.default_dtypes(device=dev) + + assert ( + int_dt == default_dts_nodev["integral"] == default_dts_dev["integral"] + ) + assert ( + ind_dt == default_dts_nodev["indexing"] == default_dts_dev["indexing"] + ) + assert ( + fp_dt + == default_dts_nodev["real floating"] + == default_dts_dev["real floating"] + ) + assert ( + cm_dt + == default_dts_nodev["complex floating"] + == default_dts_dev["complex floating"] + ) + + +def test_array_api_inspection_default_device_dtypes(): + try: + dev = dpctl.select_default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + dtypes = _dtypes_no_fp16_fp64.copy() + if dev.has_aspect_fp64: + dtypes["float64"] = dpt.float64 + dtypes["complex128"] = dpt.complex128 + + assert dtypes == dpt.__array_namespace_info__().dtypes() + + +def test_array_api_inspection_device_dtypes(): + info = dpt.__array_namespace_info__() + try: + dev = info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + dtypes = _dtypes_no_fp16_fp64.copy() + if dev.has_aspect_fp64: + dtypes["float64"] = dpt.float64 + dtypes["complex128"] = dpt.complex128 + + assert dtypes == dpt.__array_namespace_info__().dtypes(device=dev) + + +def test_array_api_inspection_dtype_kind(): + info = dpt.__array_namespace_info__() + try: + info.default_device() + except dpctl.SyclDeviceCreationError: + 
pytest.skip("No default device available") + + f_dtypes = info.dtypes(kind="real floating") + assert all([_dt[1].kind == "f" for _dt in f_dtypes.items()]) + + i_dtypes = info.dtypes(kind="signed integer") + assert all([_dt[1].kind == "i" for _dt in i_dtypes.items()]) + + u_dtypes = info.dtypes(kind="unsigned integer") + assert all([_dt[1].kind == "u" for _dt in u_dtypes.items()]) + + ui_dtypes = info.dtypes(kind="unsigned integer") + assert all([_dt[1].kind in "ui" for _dt in ui_dtypes.items()]) + + c_dtypes = info.dtypes(kind="complex floating") + assert all([_dt[1].kind == "c" for _dt in c_dtypes.items()]) + + assert info.dtypes(kind="bool") == {"bool": dpt.bool} + + _signed_ints = { + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + } + assert ( + info.dtypes(kind=("signed integer", "signed integer")) == _signed_ints + ) + assert ( + info.dtypes( + kind=("integral", "bool", "real floating", "complex floating") + ) + == info.dtypes() + ) + assert info.dtypes( + kind=("integral", "real floating", "complex floating") + ) == info.dtypes(kind="numeric") + + +def test_array_api_inspection_dtype_kind_errors(): + info = dpt.__array_namespace_info__() + try: + info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + with pytest.raises(ValueError): + info.dtypes(kind="error") + + with pytest.raises(TypeError): + info.dtypes(kind={0: "real floating"}) + + +def test_array_api_inspection_device_types(): + info = dpt.__array_namespace_info__() + try: + dev = info.default_device() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + q = dpctl.SyclQueue(dev) + assert info.default_dtypes(device=q) + assert info.dtypes(device=q) + + dev_dpt = dpt.Device.create_device(dev) + assert info.default_dtypes(device=dev_dpt) + assert info.dtypes(device=dev_dpt) + + filter = dev.get_filter_string() + assert info.default_dtypes(device=filter) + assert info.dtypes(device=filter) + + +def test_array_api_inspection_device_errors(): + info = dpt.__array_namespace_info__() + + bad_dev = {} + with pytest.raises(TypeError): + info.dtypes(device=bad_dev) + + with pytest.raises(TypeError): + info.default_dtypes(device=bad_dev) diff --git a/dpnp/tests/tensor/test_tensor_asarray.py b/dpnp/tests/tensor/test_tensor_asarray.py new file mode 100644 index 000000000000..f5caacacdac6 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_asarray.py @@ -0,0 +1,664 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +@pytest.mark.parametrize( + "src_usm_type, dst_usm_type", + [ + ("device", "shared"), + ("device", "host"), + ("shared", "device"), + ("shared", "host"), + ("host", "device"), + ("host", "shared"), + ], +) +def test_asarray_change_usm_type(src_usm_type, dst_usm_type): + try: + d = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X = dpt.empty(10, dtype="u1", usm_type=src_usm_type) + Y = dpt.asarray(X, usm_type=dst_usm_type) + assert X.shape == Y.shape + assert X.usm_type == src_usm_type + assert Y.usm_type == dst_usm_type + + with pytest.raises(ValueError): + # zero copy is not possible + dpt.asarray(X, usm_type=dst_usm_type, copy=False) + + Y = dpt.asarray(X, usm_type=dst_usm_type, sycl_queue=X.sycl_queue) + assert X.shape == Y.shape + assert Y.usm_type == dst_usm_type + + Y = dpt.asarray( + X, + usm_type=dst_usm_type, + sycl_queue=X.sycl_queue, + device=d.get_filter_string(), + ) + assert X.shape == Y.shape + assert Y.usm_type == dst_usm_type + + +def test_asarray_from_numpy(): + Xnp = np.arange(10) + try: + Y = dpt.asarray(Xnp, usm_type="device") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == Xnp.shape + assert Y.dtype == Xnp.dtype + # Fortran contiguous case + Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="f4", order="F") + Y = dpt.asarray(Xnp, usm_type="shared") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == Xnp.shape + assert Y.dtype == Xnp.dtype + # general strided case + Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="i8") + Y = dpt.asarray(Xnp[::-1, ::-1], usm_type="host") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == Xnp.shape + assert Y.dtype == Xnp.dtype + + +def test_asarray_from_sequence(): + X = [1, 2, 3] + try: + Y = dpt.asarray(X, usm_type="device") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert type(Y) is dpt.usm_ndarray + + X = [(1, 1), (2.0, 2.0 + 1.0j), range(4, 6), np.array([3, 4], dtype="c16")] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray + assert Y.ndim == 2 + assert Y.shape == (len(X), 2) + + X = [] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == (0,) + + X = [[], []] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray + assert Y.shape == (2, 0) + + X = [True, False] + Y = dpt.asarray(X, usm_type="device") + assert type(Y) is dpt.usm_ndarray 
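+    # a list of Python booleans is expected to infer a boolean dtype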
+ assert Y.dtype.kind == "b" + + +def test_asarray_from_object_with_suai(): + """Test that asarray can deal with opaque objects implementing SUAI""" + + class Dummy: + def __init__(self, obj, iface): + self.obj = obj + self.__sycl_usm_array_interface__ = iface + + try: + X = dpt.empty((2, 3, 4), dtype="f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = dpt.asarray(Dummy(X, X.__sycl_usm_array_interface__)) + assert Y.shape == X.shape + assert X.usm_type == Y.usm_type + assert X.dtype == Y.dtype + assert X.sycl_device == Y.sycl_device + + +def test_asarray_input_validation(): + with pytest.raises(TypeError): + # copy keyword is not of right type + dpt.asarray([1], copy="invalid") + with pytest.raises(TypeError): + # order keyword is not valid + dpt.asarray([1], order=1) + with pytest.raises(TypeError): + # dtype is not valid + dpt.asarray([1], dtype="invalid") + with pytest.raises(ValueError): + # unexpected value of order + dpt.asarray([1], order="Z") + with pytest.raises(TypeError): + # usm_type is of wrong type + dpt.asarray([1], usm_type=dict()) + with pytest.raises(ValueError): + # usm_type has wrong value + dpt.asarray([1], usm_type="mistake") + try: + wrong_queue_type = dpctl.SyclContext() + except dpctl.SyclContextCreationError: + # use any other type + wrong_queue_type = Ellipsis + with pytest.raises(TypeError): + # sycl_queue type is not right + dpt.asarray([1], sycl_queue=wrong_queue_type) + with pytest.raises(ValueError): + # sequence is not rectangular + dpt.asarray([[1], 2]) + with pytest.raises(OverflowError): + # Python int too large for type + dpt.asarray(-9223372036854775809, dtype="i4") + with pytest.raises(ValueError): + # buffer to usm_ndarray requires a copy + dpt.asarray(memoryview(np.arange(5)), copy=False) + with pytest.raises(ValueError): + # Numpy array to usm_ndarray requires a copy + dpt.asarray(np.arange(5), copy=False) + with pytest.raises(ValueError): + # Python sequence to usm_ndarray requires a copy + dpt.asarray([1, 2, 3], copy=False) + with pytest.raises(ValueError): + # Python scalar to usm_ndarray requires a copy + dpt.asarray(5, copy=False) + + +def test_asarray_input_validation2(): + d = dpctl.get_devices() + if len(d) < 2: + pytest.skip("Not enough SYCL devices available") + + d0, d1 = d[:2] + try: + q0 = dpctl.SyclQueue(d0) + except dpctl.SyclQueueCreationError: + pytest.skip(f"SyclQueue could not be created for {d0}") + try: + q1 = dpctl.SyclQueue(d1) + except dpctl.SyclQueueCreationError: + pytest.skip(f"SyclQueue could not be created for {d1}") + with pytest.raises(TypeError): + dpt.asarray([1, 2], sycl_queue=q0, device=q1) + + +def test_asarray_scalars(): + import ctypes + + try: + Y = dpt.asarray(5) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert Y.dtype == dpt.dtype(int) + Y = dpt.asarray(5.2) + if Y.sycl_device.has_aspect_fp64: + assert Y.dtype == dpt.dtype(float) + else: + assert Y.dtype == dpt.dtype(dpt.float32) + Y = dpt.asarray(np.float32(2.3)) + assert Y.dtype == dpt.dtype(dpt.float32) + Y = dpt.asarray(1.0j) + if Y.sycl_device.has_aspect_fp64: + assert Y.dtype == dpt.dtype(complex) + else: + assert Y.dtype == dpt.dtype(dpt.complex64) + Y = dpt.asarray(ctypes.c_int(8)) + assert Y.dtype == dpt.dtype(ctypes.c_int) + + +def test_asarray_copy_false(): + q = get_queue_or_skip() + rng = np.random.default_rng() + Xnp = rng.integers(low=-255, high=255, size=(10, 4), dtype=np.int64) + X = dpt.from_numpy(Xnp, usm_type="device", sycl_queue=q) + Y1 = dpt.asarray(X, 
copy=False, order="K") + assert Y1 is X + Y1c = dpt.asarray(X, copy=True, order="K") + assert not (Y1c is X) + Y2 = dpt.asarray(X, copy=False, order="C") + assert Y2 is X + Y3 = dpt.asarray(X, copy=False, order="A") + assert Y3 is X + with pytest.raises(ValueError): + Y1 = dpt.asarray(X, copy=False, order="F") + Xf = dpt.empty( + X.shape, + dtype=X.dtype, + usm_type="device", + sycl_queue=X.sycl_queue, + order="F", + ) + Xf[:] = X + Y4 = dpt.asarray(Xf, copy=False, order="K") + assert Y4 is Xf + Y5 = dpt.asarray(Xf, copy=False, order="F") + assert Y5 is Xf + Y6 = dpt.asarray(Xf, copy=False, order="A") + assert Y6 is Xf + with pytest.raises(ValueError): + dpt.asarray(Xf, copy=False, order="C") + + +def test_asarray_invalid_dtype(): + q = get_queue_or_skip() + Xnp = np.array([1, 2, 3], dtype=object) + with pytest.raises(TypeError): + dpt.asarray(Xnp, sycl_queue=q) + + +def test_asarray_cross_device(): + q = get_queue_or_skip() + qprof = dpctl.SyclQueue(property="enable_profiling") + x = dpt.empty(10, dtype="i8", sycl_queue=q) + y = dpt.asarray(x, sycl_queue=qprof) + assert y.sycl_queue == qprof + + +def test_asarray_seq_of_arrays_simple(): + get_queue_or_skip() + r = dpt.arange(10) + m = dpt.asarray( + [ + r, + ] + * 4 + ) + assert m.shape == (4,) + r.shape + assert m.dtype == r.dtype + assert m.device == r.device + + +def test_asarray_seq_of_arrays(): + get_queue_or_skip() + m = dpt.ones((2, 4), dtype="i4") + w = dpt.zeros(4) + v = dpt.full(4, -1) + ar = dpt.asarray([m, [w, v]]) + assert ar.shape == (2, 2, 4) + assert ar.device == m.device + assert ar.device == w.device + assert ar.device == v.device + + +def test_asarray_seq_of_array_different_queue(): + get_queue_or_skip() + m = dpt.ones((2, 4), dtype="i4") + w = dpt.zeros(4) + v = dpt.full(4, -1) + qprof = dpctl.SyclQueue(property="enable_profiling") + ar = dpt.asarray([m, [w, v]], sycl_queue=qprof) + assert ar.shape == (2, 2, 4) + assert ar.sycl_queue == qprof + + +def test_asarray_seq_of_suai(): + get_queue_or_skip() + + class Dummy: + def __init__(self, obj, iface): + self.obj = obj + self.__sycl_usm_array_interface__ = iface + + o = dpt.empty(0, usm_type="shared") + d = Dummy(o, o.__sycl_usm_array_interface__) + x = dpt.asarray(d) + assert x.shape == (0,) + assert x.usm_type == o.usm_type + assert x._pointer == o._pointer + assert x.sycl_queue == o.sycl_queue + + x = dpt.asarray([d, d]) + assert x.shape == (2, 0) + assert x.usm_type == o.usm_type + assert x.sycl_queue == o.sycl_queue + + +def test_asarray_seq_of_suai_different_queue(): + q = get_queue_or_skip() + + class Dummy: + def __init__(self, obj, iface): + self.obj = obj + self.__sycl_usm_array_interface__ = iface + + @property + def shape(self): + return self.__sycl_usm_array_interface__["shape"] + + q2 = dpctl.SyclQueue() + assert q != q2 + o = dpt.empty((2, 2), usm_type="shared", sycl_queue=q2) + d = Dummy(o, o.__sycl_usm_array_interface__) + + x = dpt.asarray(d, sycl_queue=q) + assert x.sycl_queue == q + assert x.shape == d.shape + x = dpt.asarray([d], sycl_queue=q) + assert x.sycl_queue == q + assert x.shape == (1,) + d.shape + x = dpt.asarray([d, d], sycl_queue=q) + assert x.sycl_queue == q + assert x.shape == (2,) + d.shape + + +def test_asarray_seq_of_arrays_on_different_queues(): + q = get_queue_or_skip() + + m = dpt.empty((2, 4), dtype="i2", sycl_queue=q) + q2 = dpctl.SyclQueue() + w = dpt.empty(4, dtype="i1", sycl_queue=q2) + q3 = dpctl.SyclQueue() + py_seq = [ + 0, + ] * w.shape[0] + res = dpt.asarray([m, [w, py_seq]], sycl_queue=q3) + assert res.sycl_queue == q3 + 
assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([m, [w, range(w.shape[0])]], sycl_queue=q3) + assert res.sycl_queue == q3 + assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([m, [w, w]], sycl_queue=q) + assert res.sycl_queue == q + assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([m, [w, dpt.asnumpy(w)]], sycl_queue=q2) + assert res.sycl_queue == q2 + assert dpt.isdtype(res.dtype, "integral") + + res = dpt.asarray([w, dpt.asnumpy(w)]) + assert res.sycl_queue == w.sycl_queue + assert dpt.isdtype(res.dtype, "integral") + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.asarray([m, [w, py_seq]]) + + +def test_ulonglong_gh_1167(): + get_queue_or_skip() + x = dpt.asarray(9223372036854775807, dtype="u8") + assert x.dtype == dpt.uint64 + x = dpt.asarray(9223372036854775808, dtype="u8") + assert x.dtype == dpt.uint64 + + +def test_orderK_gh_1350(): + get_queue_or_skip() + a = dpt.empty((2, 3, 4), dtype="u1") + b = dpt.permute_dims(a, (2, 0, 1)) + c = dpt.asarray(b, copy=True, order="K") + + assert c.shape == b.shape + assert c.strides == b.strides + assert c._element_offset == 0 + assert not c._pointer == b._pointer + + +def _typesafe_arange(n: int, dtype_: dpt.dtype, device: object): + n_half = n // 2 + if dtype_.kind in "ui": + ii = dpt.iinfo(dtype_) + m0 = max(ii.min, -n_half) + m1 = min(m0 + n, ii.max) + n_tiles = (n + m1 - m0 - 1) // (m1 - m0) + res = dpt.arange(m0, m1, dtype=dtype_, device=device) + elif dtype_.kind == "b": + n_tiles = (n + 1) // 2 + res = dpt.asarray([False, True], dtype=dtype_, device=device) + else: + m0 = -n_half + m1 = m0 + n + n_tiles = 1 + res = dpt.linspace(m0, m1, num=n, dtype=dtype_, device=device) + if n_tiles > 1: + res = dpt.tile(res, n_tiles)[:n] + return res + + +_all_dtypes = [ + "b1", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_c_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)).mT + x5 = x4[:, :2] + y5 = dpt.asarray(x5, order="C") + assert dpt.all(x5 == y5) + + x6 = dpt.reshape(arr_flat, (n0, n1, n2), order="F") + y6 = dpt.asarray(x6, order="C") + assert dpt.all(x6 == y6) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)) + x5 = dpt.moveaxis(x4[:, :2], (2, 3), (0, 1)) + y5 = dpt.asarray(x5, order="F") + assert dpt.all(x5 == y5) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_c_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 4, 53 + + arr_flat = 
_typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n1)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 6, 53 + + arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.moveaxis(dpt.reshape(arr_flat, (n0, n1, n1)), (1, 2), (0, 1)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[..., 0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3) + + +class MockArrayWithBothProtocols: + """ + Object that implements both __sycl_usm_array_interface__ + and __usm_ndarray__ properties. + """ + + def __init__(self, usm_ar): + if not isinstance(usm_ar, dpt.usm_ndarray): + raise TypeError + self._arr = usm_ar + + @property + def __usm_ndarray__(self): + return self._arr + + @property + def __sycl_usm_array_interface__(self): + return self._arr.__sycl_usm_array_interface__ + + +class MockArrayWithSUAIOnly: + """ + Object that implements only the + __sycl_usm_array_interface__ property. + """ + + def __init__(self, usm_ar): + if not isinstance(usm_ar, dpt.usm_ndarray): + raise TypeError + self._arr = usm_ar + + @property + def __sycl_usm_array_interface__(self): + return self._arr.__sycl_usm_array_interface__ + + +@pytest.mark.parametrize("usm_type", ["shared", "device", "host"]) +def test_asarray_support_for_usm_ndarray_protocol(usm_type): + get_queue_or_skip() + + x = dpt.arange(256, dtype="i4", usm_type=usm_type) + + o1 = MockArrayWithBothProtocols(x) + o2 = MockArrayWithSUAIOnly(x) + + y1 = dpt.asarray(o1) + assert x.sycl_queue == y1.sycl_queue + assert x.usm_type == y1.usm_type + assert x.dtype == y1.dtype + assert y1.usm_data.reference_obj is None + assert dpt.all(x == y1) + + y2 = dpt.asarray(o2) + assert x.sycl_queue == y2.sycl_queue + assert x.usm_type == y2.usm_type + assert x.dtype == y2.dtype + assert not (y2.usm_data.reference_obj is None) + assert dpt.all(x == y2) + + y3 = dpt.asarray([o1, o2]) + assert x.sycl_queue == y3.sycl_queue + assert x.usm_type == y3.usm_type + assert x.dtype == y3.dtype + assert y3.usm_data.reference_obj is None + assert dpt.all(x[dpt.newaxis, :] == y3) + + +@pytest.mark.parametrize("dt", [dpt.float16, dpt.float64, dpt.complex128]) +def test_asarray_to_device_with_unsupported_dtype(dt): + aspect = "fp16" if dt == dpt.float16 else "fp64" + try: + d0 = dpctl.select_device_with_aspects(aspect) + except dpctl.SyclDeviceCreationError: + pytest.skip("No device with aspect for test") + d1 = None + for d in dpctl.get_devices(): + if d.default_selector_score < 0: + pass + try: + d1 = dpctl.select_device_with_aspects( + d.device_type.name, excluded_aspects=[aspect] + ) + except dpctl.SyclDeviceCreationError: + pass + if d1 is None: + pytest.skip("No device with missing aspect for test") + x = dpt.ones(10, dtype=dt, device=d0) + y = dpt.asarray(x, device=d1) + assert y.sycl_device == d1 diff --git a/dpnp/tests/tensor/test_tensor_clip.py b/dpnp/tests/tensor/test_tensor_clip.py new file mode 100644 index 000000000000..cfd9f6cfab2e --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_clip.py @@ -0,0 +1,792 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor._elementwise_common import _get_dtype +from dpnp.tensor._type_utils import ( + _can_cast, + _strong_dtype_num_kind, + _weak_type_num_kind, +) + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_usm_types = ["device", "shared", "host"] + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +def test_clip_dtypes(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q) + ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # also covers cases where dt1 == dt2 + if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64): + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=ar3, max=None) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=None, max=ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + else: + with pytest.raises(ValueError): + dpt.clip(ar1, ar2, ar3) + with pytest.raises(ValueError): + dpt.clip(ar1, 
min=ar3, max=None)
+        with pytest.raises(ValueError):
+            dpt.clip(ar1, min=None, max=ar3)
+
+
+def test_clip_empty():
+    get_queue_or_skip()
+
+    x = dpt.empty((2, 0, 3), dtype="i4")
+    a_min = dpt.ones((2, 0, 3), dtype="i4")
+    a_max = dpt.ones((2, 0, 3), dtype="i4")
+
+    r = dpt.clip(x, a_min, a_max)
+    assert r.size == 0
+    assert r.shape == x.shape
+
+
+def test_clip_python_scalars():
+    get_queue_or_skip()
+
+    arrs = [
+        dpt.ones(1, dtype="?"),
+        dpt.ones(1, dtype="i4"),
+        dpt.ones(1, dtype="f4"),
+        dpt.ones(1, dtype="c8"),
+    ]
+
+    py_zeros = [
+        False,
+        0,
+        0.0,
+        complex(0, 0),
+    ]
+
+    py_ones = [
+        True,
+        1,
+        1.0,
+        complex(1, 0),
+    ]
+
+    for zero, one, arr in zip(py_zeros, py_ones, arrs):
+        r = dpt.clip(arr, zero, one)
+        assert isinstance(r, dpt.usm_ndarray)
+        r = dpt.clip(arr, min=zero)
+        assert isinstance(r, dpt.usm_ndarray)
+
+
+def test_clip_in_place():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    a_min = dpt.arange(1, 11, dtype="i4")
+    a_max = dpt.arange(2, 12, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=x)
+    assert dpt.all(x == a_min)
+
+    x = dpt.arange(10, dtype="i4")
+    dpt.clip(x, a_min, a_max, out=a_max)
+    assert dpt.all(a_max == a_min)
+
+    a_min = dpt.arange(1, 11, dtype="i4")
+    dpt.clip(x, min=a_min, max=None, out=a_min[::-1])
+    assert dpt.all((x + 1)[::-1] == a_min)
+
+
+def test_clip_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="f4")
+    r = dpt.clip(x, -dpt.inf, dpt.inf)
+    assert dpt.all(r == x)
+    r = dpt.clip(x, dpt.nan, dpt.inf)
+    assert dpt.all(dpt.isnan(r))
+    r = dpt.clip(x, -dpt.inf, dpt.nan)
+    assert dpt.all(dpt.isnan(r))
+
+
+def test_clip_out_need_temporary():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i4")
+    a_max = dpt.asarray(3, dtype="i2")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i4")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.ones(10, dtype="i4")
+    a_min = dpt.asarray(2, dtype="i2")
+    a_max = dpt.asarray(3, dtype="i1")
+    dpt.clip(x[:6], a_min, a_max, out=x[-6:])
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
+
+    x = dpt.arange(12, dtype="i4")
+    dpt.clip(x[:6], out=x[-6:])
+    expected = dpt.arange(6, dtype="i4")
+    assert dpt.all(x[:-6] == expected) and dpt.all(x[-6:] == expected)
+
+    x = dpt.ones(10, dtype="i4")
+    dpt.clip(x, out=x)
+    assert dpt.all(x == 1)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+    x = dpt.full(6, 3, dtype="i4")
+    a_min = dpt.full(10, 2, dtype="i4")
+    a_max = dpt.asarray(4, dtype="i2")
+    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
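+
+
+# Editorial sketch, not part of the upstream test suite: the overlap cases
+# above rely on clip staging its result in a temporary whenever `out` shares
+# memory with an input, so an overlapping destination still receives the
+# correct values. A minimal illustration (assuming the dpt.clip signature
+# exercised above):
+#
+#     x = dpt.arange(10, dtype="i4")    # [0, 1, ..., 9]
+#     dpt.clip(x[:6], 2, 3, out=x[:6])  # writes over its own input
+#     # x[:6] is now [2, 2, 2, 3, 3, 3]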
+
+
+def test_clip_out_need_temporary_none():
+    get_queue_or_skip()
+
+    x = dpt.full(6, 3, dtype="i4")
+    # with min/max == None
+    a_min = dpt.full(10, 2, dtype="i4")
+    dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:])
+    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
+
+
+def test_clip_arg_validation():
+    get_queue_or_skip()
+
+    check = {}
+    x1 = dpt.empty((1,), dtype="i4")
+    x2 = dpt.empty((1,), dtype="i4")
+
+    with pytest.raises(TypeError):
+        dpt.clip(check, x1, x2)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x1, check, x2)
+
+    with pytest.raises(ValueError):
+        dpt.clip(x1, check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, x1, x2, out=check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, x2, out=check)
+
+    with pytest.raises(TypeError):
+        dpt.clip(x1, out=check)
+
+
+@pytest.mark.parametrize(
+    "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")]
+)
+def test_clip_order(dt1, dt2):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="C")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="C")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, ar2, ar3, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
+    ar2 = dpt.ones(test_shape, dtype=dt1, order="F")
+    ar3 = dpt.ones(test_shape, dtype=dt2, order="F")
+    r1 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, ar2, ar3, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, ar2, ar3, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
+    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.strides == (n, -1)
+    r5 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r5.strides == (n, 1)
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT
+    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
+    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
+    r4 = dpt.clip(ar1, ar2, ar3, order="K")
+    assert r4.strides == (-1, n)
+    r5 = dpt.clip(ar1, ar2, ar3, order="C")
+    assert r5.strides == (n, 1)
+
+
+@pytest.mark.parametrize("dt", ["i4", "i2"])
+def test_clip_none_order(dt):
+    get_queue_or_skip()
+
+    test_shape = (
+        20,
+        20,
+    )
+    test_shape2 = tuple(2 * dim for dim in test_shape)
+    n = test_shape[-1]
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
+    ar2 = dpt.ones(test_shape, dtype=dt, order="C")
+
+    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
+    assert r3.flags.c_contiguous
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.flags.c_contiguous
+
+    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
+    ar2 = dpt.ones(test_shape, dtype=dt, order="F")
+
+    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
+    assert r1.flags.c_contiguous
+    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
+    assert r2.flags.f_contiguous
+    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
+    assert r3.flags.f_contiguous
+    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
+    assert r4.flags.f_contiguous
+
+    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
+    ar2 = dpt.ones(test_shape2, 
dtype=dt, order="C")[:20, ::-2] + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +@pytest.mark.parametrize("usm_type3", _usm_types) +def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3) + + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type( + (usm_type1, usm_type2, usm_type3) + ) + assert r.usm_type == expected_usm_type + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + + r = dpt.clip(ar1, min=ar2, max=None) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpt.get_coerced_usm_type((usm_type1, usm_type2)) + assert r.usm_type == expected_usm_type + + +def test_clip_dtype_error(): + get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4") + ar2 = dpt.ones(1, dtype="i4") + ar3 = dpt.ones(1, dtype="i4") + ar4 = dpt.empty_like(ar1, dtype="f4") + + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + dpt.ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + None, + ar3, + ar4, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + ar2, + ar3, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + 1, + ar3, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + 1, + ar4, + ar3, + ) + + assert_raises_regex( + dpt.ExecutionPlacementError, + "Execution placement can not 
be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + None, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="float32") + ar3 = dpt.ones_like(ar1, dtype="float32") + ar4 = dpt.empty(3, dtype="float32") + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + ar1 = np.ones(2, dtype="f4") + ar2 = dpt.ones(2, dtype="f4") + ar3 = dpt.ones(2, dtype="f4") + assert_raises_regex( + TypeError, + "Expected `x` to be of dpnp.tensor.usm_ndarray type*", + dpt.clip, + ar1, + ar2, + ar3, + ) + + ar1 = dpt.ones(2, dtype="i4") + ar2 = dpt.ones_like(ar1, dtype="i4") + ar3 = dpt.ones_like(ar1, dtype="i4") + ar4 = np.empty(ar1.shape, dtype=ar1.dtype) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + x3 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.clip(x1, x2, x3, out=out) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q) + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q) + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_strided(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 2 * 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2] + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + expected = expected[::-2] + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2] + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + a_max = a_max[::-2] + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +def test_clip_max_less_than_min(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.clip(x, 5, 0) + assert dpt.all(res == 0) + + +@pytest.mark.parametrize("dt", ["?", "i4", "f4", "c8"]) +def test_clip_minmax_weak_types(dt): + get_queue_or_skip() + + x = dpt.zeros(10, dtype=dt) + min_list = [False, 0, 0.0, 0.0 + 0.0j] + max_list = [True, 1, 1.0, 1.0 + 0.0j] + + for min_v, max_v in zip(min_list, max_list): + st_dt = _strong_dtype_num_kind(dpt.dtype(dt)) + wk_dt1 = _weak_type_num_kind(_get_dtype(min_v, x.sycl_device)) + wk_dt2 = _weak_type_num_kind(_get_dtype(max_v, x.sycl_device)) + + if st_dt >= wk_dt1 and st_dt >= wk_dt2: + r = dpt.clip(x, min_v, max_v) + assert isinstance(r, dpt.usm_ndarray) + else: + with pytest.raises(ValueError): + dpt.clip(x, min_v, max_v) + + if st_dt >= wk_dt1: + r = dpt.clip(x, min_v) + assert isinstance(r, dpt.usm_ndarray) + + r = dpt.clip(x, None, min_v) + assert isinstance(r, dpt.usm_ndarray) + else: + with 
pytest.raises(ValueError): + dpt.clip(x, min_v) + with pytest.raises(ValueError): + dpt.clip(x, None, max_v) + + +def test_clip_max_weak_type_errors(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype="i4") + m = dpt.ones(10, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, m, 2.5) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5, m) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5) + + with pytest.raises(ValueError): + dpt.clip(dpt.astype(x, "?"), 2) + + with pytest.raises(ValueError): + dpt.clip(dpt.astype(x, "f4"), complex(2)) + + +def test_clip_unaligned(): + get_queue_or_skip() + + x = dpt.full(513, 5, dtype="i4") + a_min = dpt.zeros(512, dtype="i4") + a_max = dpt.full(512, 2, dtype="i4") + + expected = dpt.full(512, 2, dtype="i4") + assert dpt.all(dpt.clip(x[1:], a_min, a_max) == expected) + + +def test_clip_none_args(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + r = dpt.clip(x) + assert dpt.all(x == r) + + +def test_clip_shape_errors(): + get_queue_or_skip() + + x = dpt.ones((4, 4), dtype="i4") + a_min = dpt.ones(5, dtype="i4") + a_max = dpt.ones(5, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, a_min, a_max) + + with pytest.raises(ValueError): + dpt.clip(x, a_min) + + with pytest.raises(ValueError): + dpt.clip(x, 0, 1, out=a_min) + + with pytest.raises(ValueError): + dpt.clip(x, 0, out=a_min) + + with pytest.raises(ValueError): + dpt.clip(x, out=a_min) + + +def test_clip_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.ones(10, dtype="i4", sycl_queue=q1) + a_min = dpt.ones(10, dtype="i4", sycl_queue=q2) + a_max = dpt.ones(10, dtype="i4", sycl_queue=q1) + res = dpt.empty_like(x, sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, a_min, a_max) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, dpt.ones_like(x), a_max, out=res) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, a_min) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, None, a_max, out=res) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.clip(x, out=res) + + +def test_clip_readonly_out(): + get_queue_or_skip() + x = dpt.arange(32, dtype=dpt.int32) + r = dpt.empty_like(x) + r.flags["W"] = False + + with pytest.raises(ValueError): + dpt.clip(x, min=0, max=10, out=r) + + with pytest.raises(ValueError): + dpt.clip(x, max=10, out=r) + + with pytest.raises(ValueError): + dpt.clip(x, min=0, out=r) + + with pytest.raises(ValueError): + dpt.clip(x, out=r) + + +def test_clip_gh_1744(): + get_queue_or_skip() + x = dpt.asarray([0, 255], dtype=dpt.uint8) + y = dpt.clip(x, -300, 300) + + assert dpt.all(x == y) diff --git a/dpnp/tests/tensor/test_tensor_copy_utils.py b/dpnp/tests/tensor/test_tensor_copy_utils.py new file mode 100644 index 000000000000..878877dcaa4c --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_copy_utils.py @@ -0,0 +1,113 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pytest + +import dpnp.tensor as dpt +import dpnp.tensor._copy_utils as cu + +from .helper import get_queue_or_skip + + +def test_copy_utils_empty_like_orderK(): + get_queue_or_skip() + a = dpt.empty((10, 10), dtype=dpt.int32, order="F") + X = cu._empty_like_orderK(a, dpt.int32, a.usm_type, a.device) + assert X.flags["F"] + + +def test_copy_utils_empty_like_orderK_invalid_args(): + get_queue_or_skip() + with pytest.raises(TypeError): + cu._empty_like_orderK([1, 2, 3], dpt.int32, "device", None) + with pytest.raises(TypeError): + cu._empty_like_pair_orderK( + [1, 2, 3], + ( + 1, + 2, + 3, + ), + dpt.int32, + (3,), + "device", + None, + ) + + a = dpt.empty(10, dtype=dpt.int32) + with pytest.raises(TypeError): + cu._empty_like_pair_orderK( + a, + ( + 1, + 2, + 3, + ), + dpt.int32, + (10,), + "device", + None, + ) + + +def test_copy_utils_from_numpy_empty_like_orderK(): + q = get_queue_or_skip() + + a = np.empty((10, 10), dtype=np.int32, order="C") + r0 = cu._from_numpy_empty_like_orderK(a, dpt.int32, "device", q) + assert r0.flags["C"] + + b = np.empty((10, 10), dtype=np.int32, order="F") + r1 = cu._from_numpy_empty_like_orderK(b, dpt.int32, "device", q) + assert r1.flags["F"] + + c = np.empty((2, 3, 4), dtype=np.int32, order="C") + c = np.transpose(c, (1, 0, 2)) + r2 = cu._from_numpy_empty_like_orderK(c, dpt.int32, "device", q) + assert not r2.flags["C"] and not r2.flags["F"] + + +def test_copy_utils_from_numpy_empty_like_orderK_invalid_args(): + with pytest.raises(TypeError): + cu._from_numpy_empty_like_orderK([1, 2, 3], dpt.int32, "device", None) + + +def test_gh_2055(): + """ + Test that `dpt.asarray` works on contiguous NumPy arrays with `order="K"` + when dimensions are permuted. 
+ + See: https://github.com/IntelPython/dpctl/issues/2055 + """ + get_queue_or_skip() + + a = np.ones((2, 3, 4), dtype=dpt.int32) + a_t = np.transpose(a, (2, 0, 1)) + r = dpt.asarray(a_t) + assert not r.flags["C"] and not r.flags["F"] diff --git a/dpnp/tests/tensor/test_tensor_diff.py b/dpnp/tests/tensor/test_tensor_diff.py new file mode 100644 index 000000000000..f75b9d4a3639 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_diff.py @@ -0,0 +1,344 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from math import prod + +import pytest +from numpy.testing import assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor._type_utils import _to_device_supported_dtype + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_diff_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype=dt, sycl_queue=q) + op = dpt.not_equal if x.dtype is dpt.bool else dpt.subtract + + # test both n=2 and n>2 branches + for n in [1, 2, 5]: + res = dpt.diff(x, n=n) + expected_res = x + for _ in range(n): + expected_res = op(expected_res[1:], expected_res[:-1]) + if dpt.dtype(dt).kind in "fc": + assert dpt.allclose(res, expected_res) + else: + assert dpt.all(res == expected_res) + + +def test_diff_axis(): + get_queue_or_skip() + + x = dpt.tile( + dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype="i4"), (3, 4, 1) + ) + x[:, ::2, :] = 0 + + for n in [1, 2, 3]: + res = dpt.diff(x, n=n, axis=1) + expected_res = x + for _ in range(n): + expected_res = dpt.subtract( + expected_res[:, 1:, :], expected_res[:, :-1, :] + ) + assert dpt.all(res == expected_res) + + +def test_diff_prepend_append_type_promotion(): + get_queue_or_skip() + + dts = [ + ("i1", "u1", "i8"), + ("i1", "u8", "u1"), + ("u4", "i4", "f4"), + ("i8", "c8", "u8"), + ] + + for dt0, dt1, dt2 in dts: + x = dpt.ones(10, dtype=dt1) + prepend = dpt.full(1, 2, dtype=dt0) + append = dpt.full(1, 3, dtype=dt2) + + res = dpt.diff(x, prepend=prepend, append=append) + assert res.dtype == _to_device_supported_dtype( + dpt.result_type(prepend, x, append), + x.sycl_queue.sycl_device, + ) + + res = dpt.diff(x, prepend=prepend) + assert res.dtype == _to_device_supported_dtype( + dpt.result_type(prepend, x), + x.sycl_queue.sycl_device, + ) + + res = dpt.diff(x, append=append) + assert res.dtype == _to_device_supported_dtype( + dpt.result_type(x, append), + x.sycl_queue.sycl_device, + ) + + +def test_diff_0d(): + get_queue_or_skip() + + x = dpt.ones(()) + with pytest.raises(ValueError): + dpt.diff(x) + + +def test_diff_empty_array(): + get_queue_or_skip() + + x = dpt.ones((3, 0, 5)) + res = dpt.diff(x, axis=1) + assert res.shape == x.shape + + res = dpt.diff(x, axis=0) + assert res.shape == (2, 0, 5) + + append = dpt.ones((3, 2, 5)) + res = dpt.diff(x, axis=1, append=append) + assert res.shape == (3, 1, 5) + + prepend = dpt.ones((3, 2, 5)) + res = dpt.diff(x, axis=1, prepend=prepend) + assert res.shape == (3, 1, 5) + + +def test_diff_no_op(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.diff(x, n=0) + assert dpt.all(x == res) + + x = dpt.reshape(x, (2, 5)) + res = dpt.diff(x, n=0, axis=0) + assert dpt.all(x == res) + + +@pytest.mark.parametrize("sh,axis", [((1,), 0), ((3, 4, 5), 1)]) +def test_diff_prepend_append_py_scalars(sh, axis): + get_queue_or_skip() + + n = 1 + + arr = dpt.ones(sh, dtype="i4") + zero = 0 + + # first and last elements along axis + # will be checked for correctness + sl1 = [slice(None)] * arr.ndim + sl1[axis] = slice(1) + sl1 = tuple(sl1) + + sl2 = [slice(None)] * arr.ndim + sl2[axis] = slice(-1, None, None) + sl2 = tuple(sl2) + + r = dpt.diff(arr, axis=axis, prepend=zero, append=zero) + assert all(r.shape[i] == arr.shape[i] for i in 
range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 2 - n + assert dpt.all(r[sl1] == 1) + assert dpt.all(r[sl2] == -1) + + r = dpt.diff(arr, axis=axis, prepend=zero) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 1 - n + assert dpt.all(r[sl1] == 1) + + r = dpt.diff(arr, axis=axis, append=zero) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 1 - n + assert dpt.all(r[sl2] == -1) + + r = dpt.diff(arr, axis=axis, prepend=dpt.asarray(zero), append=zero) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 2 - n + assert dpt.all(r[sl1] == 1) + assert dpt.all(r[sl2] == -1) + + r = dpt.diff(arr, axis=axis, prepend=zero, append=dpt.asarray(zero)) + assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis) + assert r.shape[axis] == arr.shape[axis] + 2 - n + assert dpt.all(r[sl1] == 1) + assert dpt.all(r[sl2] == -1) + + +def test_tensor_diff_append_prepend_arrays(): + get_queue_or_skip() + + n = 1 + axis = 0 + + for sh in [(5,), (3, 4, 5)]: + sz = prod(sh) + arr = dpt.reshape(dpt.arange(sz, 2 * sz, dtype="i4"), sh) + prepend = dpt.reshape(dpt.arange(sz, dtype="i4"), sh) + append = dpt.reshape(dpt.arange(2 * sz, 3 * sz, dtype="i4"), sh) + const_diff = sz / sh[axis] + + r = dpt.diff(arr, axis=axis, prepend=prepend, append=append) + assert all( + r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis + ) + assert ( + r.shape[axis] + == arr.shape[axis] + prepend.shape[axis] + append.shape[axis] - n + ) + assert dpt.all(r == const_diff) + + r = dpt.diff(arr, axis=axis, prepend=prepend) + assert all( + r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis + ) + assert r.shape[axis] == arr.shape[axis] + prepend.shape[axis] - n + assert dpt.all(r == const_diff) + + r = dpt.diff(arr, axis=axis, append=append) + assert all( + r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis + ) + assert r.shape[axis] == arr.shape[axis] + append.shape[axis] - n + assert dpt.all(r == const_diff) + + +def test_diff_wrong_append_prepend_shape(): + get_queue_or_skip() + + arr = dpt.ones((3, 4, 5), dtype="i4") + arr_bad_sh = dpt.ones(2, dtype="i4") + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + prepend=arr_bad_sh, + append=arr_bad_sh, + ) + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + prepend=arr, + append=arr_bad_sh, + ) + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + prepend=arr_bad_sh, + ) + + assert_raises_regex( + ValueError, + ".*shape.*is invalid.*", + dpt.diff, + arr, + append=arr_bad_sh, + ) + + +def test_diff_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + q3 = get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4", sycl_queue=q1) + ar2 = dpt.ones(1, dtype="i4", sycl_queue=q2) + ar3 = dpt.ones(1, dtype="i4", sycl_queue=q3) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=ar2, append=ar3) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=ar2, append=0) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=0, append=ar2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, prepend=ar2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.diff(ar1, append=ar2) + + +def 
test_diff_input_validation(): + bad_in = {} + assert_raises_regex( + TypeError, + "Expecting dpnp.tensor.usm_ndarray type, got.*", + dpt.diff, + bad_in, + ) + + +def test_diff_positive_order(): + get_queue_or_skip() + + x = dpt.ones(1, dtype="i4") + n = -1 + assert_raises_regex( + ValueError, + ".*must be positive.*", + dpt.diff, + x, + n=n, + ) diff --git a/dpnp/tests/tensor/test_tensor_dtype_routines.py b/dpnp/tests/tensor/test_tensor_dtype_routines.py new file mode 100644 index 000000000000..588926c0d123 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_dtype_routines.py @@ -0,0 +1,170 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import dpctl +import pytest + +import dpnp.tensor as dpt + +list_dtypes = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float16", + "float32", + "float64", + "complex64", + "complex128", +] + + +dtype_categories = { + "bool": ["bool"], + "signed integer": ["int8", "int16", "int32", "int64"], + "unsigned integer": ["uint8", "uint16", "uint32", "uint64"], + "integral": [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + ], + "real floating": ["float16", "float32", "float64"], + "complex floating": ["complex64", "complex128"], + "numeric": [d for d in list_dtypes if d != "bool"], +} + + +@pytest.mark.parametrize("kind_str", dtype_categories.keys()) +@pytest.mark.parametrize("dtype_str", list_dtypes) +def test_isdtype_kind_str(dtype_str, kind_str): + dt = dpt.dtype(dtype_str) + is_in_kind = dpt.isdtype(dt, kind_str) + expected = dtype_str in dtype_categories[kind_str] + assert is_in_kind == expected + + +@pytest.mark.parametrize("dtype_str", list_dtypes) +def test_isdtype_kind_tuple(dtype_str): + dt = dpt.dtype(dtype_str) + if dtype_str.startswith("bool"): + assert dpt.isdtype(dt, ("real floating", "bool")) + assert not dpt.isdtype( + dt, ("integral", "real floating", "complex floating") + ) + elif dtype_str.startswith("int"): + assert dpt.isdtype(dt, ("real floating", "signed integer")) + assert not dpt.isdtype( + dt, ("bool", "unsigned integer", "real floating") + ) + elif dtype_str.startswith("uint"): + assert dpt.isdtype(dt, ("bool", "unsigned integer")) + assert not dpt.isdtype(dt, ("real floating", "complex floating")) + elif dtype_str.startswith("float"): + assert dpt.isdtype(dt, ("complex floating", "real floating")) + assert not dpt.isdtype(dt, ("integral", "complex floating", "bool")) + else: + assert dpt.isdtype(dt, ("integral", "complex floating")) + assert not dpt.isdtype(dt, ("bool", "integral", "real floating")) + + +@pytest.mark.parametrize("dtype_str", list_dtypes) +def test_isdtype_kind_tuple_dtypes(dtype_str): + dt = dpt.dtype(dtype_str) + if dtype_str.startswith("bool"): + assert dpt.isdtype(dt, (dpt.int32, dpt.bool)) + assert not dpt.isdtype(dt, (dpt.int16, dpt.uint32, dpt.float64)) + + elif dtype_str.startswith("int"): + assert dpt.isdtype(dt, (dpt.int8, dpt.int16, dpt.int32, dpt.int64)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.float32, dpt.complex64)) + + elif dtype_str.startswith("uint"): + assert dpt.isdtype(dt, (dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.int32, dpt.float32)) + + elif dtype_str.startswith("float"): + assert dpt.isdtype(dt, (dpt.float16, dpt.float32, dpt.float64)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.complex64, dpt.int8)) + + else: + assert dpt.isdtype(dt, (dpt.complex64, dpt.complex128)) + assert not dpt.isdtype(dt, (dpt.bool, dpt.uint64, dpt.int8)) + + +@pytest.mark.parametrize( + "kind", + [ + [dpt.int32, dpt.bool], + "f4", + float, + 123, + "complex", + ], +) +def test_isdtype_invalid_kind(kind): + with pytest.raises((TypeError, ValueError)): + dpt.isdtype(dpt.int32, kind) + + +def test_finfo_array(): + try: + x = dpt.empty(tuple(), dtype="f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("Default-selected SYCL device unavailable") + o = dpt.finfo(x) + assert o.dtype == dpt.float32 + + +def test_iinfo_array(): + try: + x = dpt.empty(tuple(), dtype="i4") + except dpctl.SyclDeviceCreationError: + 
pytest.skip("Default-selected SYCL device unavailable") + o = dpt.iinfo(x) + assert o.dtype == dpt.int32 + + +def test_iinfo_validation(): + with pytest.raises(ValueError): + dpt.iinfo("O") + + +def test_finfo_validation(): + with pytest.raises(ValueError): + dpt.iinfo("O") diff --git a/dpnp/tests/tensor/test_tensor_isin.py b/dpnp/tests/tensor/test_tensor_isin.py new file mode 100644 index 000000000000..08f1787f733f --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_isin.py @@ -0,0 +1,281 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
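+
+
+# Editorial sketch, not part of the upstream test suite: the category-based
+# introspection exercised above, in compact form (results follow the array
+# API specification that dpnp.tensor implements):
+#
+#     dpt.isdtype(dpt.int32, "integral")                   # True
+#     dpt.isdtype(dpt.float32, ("bool", "real floating"))  # True
+#     dpt.isdtype(dpt.complex64, "numeric")                # True
+#     dpt.finfo(dpt.float32).eps                           # ~1.19e-07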
diff --git a/dpnp/tests/tensor/test_tensor_isin.py b/dpnp/tests/tensor/test_tensor_isin.py
new file mode 100644
index 000000000000..08f1787f733f
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_isin.py
@@ -0,0 +1,281 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import ctypes
+
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_numeric_dtypes = [
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+_all_dtypes = ["?"] + _numeric_dtypes
+
+
+@pytest.mark.parametrize("dtype", _numeric_dtypes)
+def test_isin_basic(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n = 100
+    x = dpt.arange(n, dtype=dtype, sycl_queue=q)
+    test = dpt.arange(n - 1, dtype=dtype, sycl_queue=q)
+    r1 = dpt.isin(x, test)
+    assert dpt.all(r1[:-1])
+    assert not r1[-1]
+    assert r1.shape == x.shape
+
+    # test with invert keyword
+    r2 = dpt.isin(x, test, invert=True)
+    assert not dpt.any(r2[:-1])
+    assert r2[-1]
+    assert r2.shape == x.shape
+
+
+def test_isin_basic_bool():
+    dt = dpt.bool
+    n = 100
+    x = dpt.zeros(n, dtype=dt)
+    x[-1] = True
+    test = dpt.zeros((), dtype=dt)
+    r1 = dpt.isin(x, test)
+    assert dpt.all(r1[:-1])
+    assert not r1[-1]
+    assert r1.shape == x.shape
+
+    r2 = dpt.isin(x, test, invert=True)
+    assert not dpt.any(r2[:-1])
+    assert r2[-1]
+    assert r2.shape == x.shape
+
+
+@pytest.mark.parametrize("dtype", _numeric_dtypes)
+def test_isin_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, m = 100, 20
+    x = dpt.zeros((n, m), dtype=dtype, order="F", sycl_queue=q)
+    x[:, ::2] = dpt.arange(1, (m / 2) + 1, dtype=dtype, sycl_queue=q)
+    x_s = x[:, ::2]
+    test = dpt.arange(1, (m / 2), dtype=dtype, sycl_queue=q)
+    r1 = dpt.isin(x_s, test)
+    assert dpt.all(r1[:, :-1])
+    assert not dpt.any(r1[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r1.shape == x_s.shape
+    assert r1.flags.c_contiguous
+
+    # test with invert keyword
+    r2 = dpt.isin(x_s, test, invert=True)
+    assert not dpt.any(r2[:, :-1])
+    assert dpt.all(r2[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r2.shape == x_s.shape
+    assert r2.flags.c_contiguous
+
+
+def test_isin_strided_bool():
+    dt = dpt.bool
+
+    n, m = 100, 20
+    x = dpt.zeros((n, m), dtype=dt, order="F")
+    x[:, :-2:2] = True
+    x_s = x[:, ::2]
+    test = dpt.ones((), dtype=dt)
+    r1 = dpt.isin(x_s, test)
+    assert dpt.all(r1[:, :-1])
+    assert not dpt.any(r1[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r1.shape == x_s.shape
+    assert r1.flags.c_contiguous
+
+    # test with invert keyword
+    r2 = dpt.isin(x_s, test, invert=True)
+    assert not dpt.any(r2[:, :-1])
+    assert dpt.all(r2[:, -1])
+    assert not dpt.any(x[:, 1::2])
+    assert r2.shape == x_s.shape
+    assert r2.flags.c_contiguous
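+
+
+# Editorial sketch, not part of the upstream test suite: dpt.isin is an
+# elementwise membership test of `x` against the flattened test values,
+# e.g. for the shapes used below:
+#
+#     x = dpt.asarray([0, 1, 11], dtype="i4")
+#     dpt.isin(x, dpt.arange(10, dtype="i4"))  # [True, True, False]
+#     # invert=True would yield the negated mask [False, False, True]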
+
+
+@pytest.mark.parametrize("dt1", _numeric_dtypes)
+@pytest.mark.parametrize("dt2", _numeric_dtypes)
+def test_isin_dtype_matrix(dt1, dt2):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    sz = 10
+    x = dpt.asarray([0, 1, 11], dtype=dt1, sycl_queue=q)
+    test1 = dpt.arange(sz, dtype=dt2, sycl_queue=q)
+
+    r1 = dpt.isin(x, test1)
+    assert isinstance(r1, dpt.usm_ndarray)
+    assert r1.dtype == dpt.bool
+    assert r1.shape == x.shape
+    assert not r1[-1]
+    assert dpt.all(r1[0:-1])
+    assert r1.sycl_queue == x.sycl_queue
+
+    test2 = dpt.tile(dpt.asarray([[0, 1]], dtype=dt2, sycl_queue=q).mT, 2)
+    r2 = dpt.isin(x, test2)
+    assert isinstance(r2, dpt.usm_ndarray)
+    assert r2.dtype == dpt.bool
+    assert r2.shape == x.shape
+    assert not r2[-1]
+    assert dpt.all(r2[0:-1])
+    assert r2.sycl_queue == x.sycl_queue
+
+
+def test_isin_empty_inputs():
+    get_queue_or_skip()
+
+    x = dpt.ones((10, 0, 1), dtype="i4")
+    test = dpt.ones((), dtype="i4")
+    res1 = dpt.isin(x, test)
+    assert isinstance(res1, dpt.usm_ndarray)
+    assert res1.size == 0
+    assert res1.shape == x.shape
+    assert res1.dtype == dpt.bool
+
+    res2 = dpt.isin(x, test, invert=True)
+    assert isinstance(res2, dpt.usm_ndarray)
+    assert res2.size == 0
+    assert res2.shape == x.shape
+    assert res2.dtype == dpt.bool
+
+    x = dpt.ones((3, 3), dtype="i4")
+    test = dpt.ones(0, dtype="i4")
+    res3 = dpt.isin(x, test)
+    assert isinstance(res3, dpt.usm_ndarray)
+    assert res3.shape == x.shape
+    assert res3.dtype == dpt.bool
+    assert not dpt.all(res3)
+
+    res4 = dpt.isin(x, test, invert=True)
+    assert isinstance(res4, dpt.usm_ndarray)
+    assert res4.shape == x.shape
+    assert res4.dtype == dpt.bool
+    assert dpt.all(res4)
+
+
+def test_isin_validation():
+    get_queue_or_skip()
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.isin(1, 1)
+    not_bool = {}
+    with pytest.raises(TypeError):
+        dpt.isin(dpt.ones([1]), dpt.ones([1]), invert=not_bool)
+
+
+def test_isin_special_floating_point_vals():
+    get_queue_or_skip()
+
+    # real and complex nans compare false
+    x = dpt.asarray(dpt.nan, dtype="f4")
+    test = dpt.asarray(dpt.nan, dtype="f4")
+    assert not dpt.isin(x, test)
+
+    x = dpt.asarray(dpt.nan, dtype="c8")
+    test = dpt.asarray(dpt.nan, dtype="c8")
+    assert not dpt.isin(x, test)
+
+    # -0.0 compares equal to +0.0
+    x = dpt.asarray(-0.0, dtype="f4")
+    test = dpt.asarray(0.0, dtype="f4")
+    assert dpt.isin(x, test)
+    assert dpt.isin(test, x)
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_isin_py_scalars(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.zeros((10, 10), dtype=dt, sycl_queue=q)
+    py_zeros = (
+        bool(0),
+        int(0),
+        float(0),
+        complex(0),
+        np.float32(0),
+        ctypes.c_int(0),
+    )
+    for sc in py_zeros:
+        r1 = dpt.isin(x, sc)
+        assert isinstance(r1, dpt.usm_ndarray)
+        r2 = dpt.isin(sc, x)
+        assert isinstance(r2, dpt.usm_ndarray)
+
+
+def test_isin_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    x = dpt.ones(10, sycl_queue=q1)
+    test = dpt.ones_like(x, sycl_queue=q2)
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.isin(x, test)
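+
+
+# Editorial sketch, not part of the upstream test suite: the special-value
+# checks above follow IEEE-754 equality, under which nan != nan while
+# -0.0 == +0.0, so membership is decided by `==` rather than by identity:
+#
+#     dpt.isin(dpt.asarray(dpt.nan, dtype="f4"),
+#              dpt.asarray(dpt.nan, dtype="f4"))  # False
+#     dpt.isin(dpt.asarray(-0.0, dtype="f4"),
+#              dpt.asarray(0.0, dtype="f4"))      # True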
diff --git a/dpnp/tests/tensor/test_tensor_statistical_functions.py b/dpnp/tests/tensor/test_tensor_statistical_functions.py
new file mode 100644
index 000000000000..7e444500d75f
--- /dev/null
+++ b/dpnp/tests/tensor/test_tensor_statistical_functions.py
@@ -0,0 +1,271 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+from dpnp.tensor._tensor_impl import default_device_fp_type
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_no_complex_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+]
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+def test_mean_dtypes(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.mean(x)
+    assert res == 1
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+@pytest.mark.parametrize("dt", _no_complex_dtypes)
+@pytest.mark.parametrize("py_zero", [float(0), int(0)])
+def test_std_var_dtypes(dt, py_zero):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.ones(10, dtype=dt)
+    res = dpt.std(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+    res = dpt.var(x, correction=py_zero)
+    assert res == 0
+    if x.dtype.kind in "biu":
+        assert res.dtype == dpt.dtype(default_device_fp_type(q))
+    else:
+        assert res.dtype == x.dtype
+
+
+def test_stat_fns_axis():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1))
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 6)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1))
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 6)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+@pytest.mark.parametrize("fn", [dpt.mean, dpt.var])
+def test_stat_fns_empty(fn):
+    get_queue_or_skip()
+    x = dpt.empty((0,), dtype="f4")
+    r = fn(x)
+    assert r.shape == ()
+    assert dpt.isnan(r)
+
+    x = dpt.empty((10, 0, 2), dtype="f4")
+    r = fn(x, axis=1)
+    assert r.shape == (10, 2)
+    assert dpt.all(dpt.isnan(r))
+
+    r = fn(x, axis=0)
+    assert r.shape == (0, 2)
+    assert r.size == 0
+
+
+def test_stat_fns_keepdims():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
+    m = dpt.mean(x, axis=(1, 2, -1), keepdims=True)
+
+    assert isinstance(m, dpt.usm_ndarray)
+    assert m.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
+
+    s = dpt.var(x, axis=(1, 2, -1), keepdims=True)
+    assert isinstance(s, dpt.usm_ndarray)
+    assert s.shape == (3, 1, 1, 6, 1)
+    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
+
+
+def test_stat_fns_empty_axis():
+    get_queue_or_skip()
+
+    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
+    m = 
dpt.mean(x, axis=()) + + assert x.shape == m.shape + assert dpt.all(x == m) + + s = dpt.var(x, axis=()) + assert x.shape == s.shape + assert dpt.all(s == 0) + + d = dpt.std(x, axis=()) + assert x.shape == d.shape + assert dpt.all(d == 0) + + +def test_mean(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + m = dpt.mean(x) + expected = dpt.asarray(4, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=0) + expected = dpt.arange(3, 6, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=1) + expected = dpt.asarray([1, 4, 7], dtype="f4") + assert dpt.allclose(m, expected) + + +def test_var_std(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + r = dpt.var(x) + expected = dpt.asarray(6.666666507720947, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, correction=3) + expected1 = dpt.asarray(10.0, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, correction=3) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=0) + expected = dpt.full(x.shape[1], 6, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=0, correction=1) + expected1 = dpt.full(x.shape[1], 9, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=0) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=0, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=1) + expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=1, correction=1) + expected1 = dpt.ones(x.shape[0], dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=1) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=1, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + +def test_var_axis_length_correction(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + + r = dpt.var(x, correction=x.size) + assert dpt.isnan(r) + + r = dpt.var(x, axis=0, correction=x.shape[0]) + assert dpt.all(dpt.isnan(r)) + + r = dpt.var(x, axis=1, correction=x.shape[1]) + assert dpt.all(dpt.isnan(r)) + + +def test_stat_function_errors(): + d = {} + with pytest.raises(TypeError): + dpt.var(d) + with pytest.raises(TypeError): + dpt.std(d) + with pytest.raises(TypeError): + dpt.mean(d) + + get_queue_or_skip() + x = dpt.empty(1, dtype="f4") + with pytest.raises(TypeError): + dpt.var(x, axis=d) + with pytest.raises(TypeError): + dpt.std(x, axis=d) + with pytest.raises(TypeError): + dpt.mean(x, axis=d) + + with pytest.raises(TypeError): + dpt.var(x, correction=d) + with pytest.raises(TypeError): + dpt.std(x, correction=d) + + x = dpt.empty(1, dtype="c8") + with pytest.raises(ValueError): + dpt.var(x) + with pytest.raises(ValueError): + dpt.std(x) diff --git a/dpnp/tests/tensor/test_tensor_sum.py b/dpnp/tests/tensor/test_tensor_sum.py new file mode 100644 index 000000000000..90e548f1b28c --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_sum.py @@ -0,0 +1,348 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + # test reduction for C-contiguous input + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(m) + + assert isinstance(r, dpt.usm_ndarray) + if m.dtype.kind == "i": + assert r.dtype.kind == "i" + elif m.dtype.kind == "u": + assert r.dtype.kind == "u" + elif m.dtype.kind == "f": + assert r.dtype.kind == "f" + elif m.dtype.kind == "c": + assert r.dtype.kind == "c" + + assert dpt.all(r == 100) + + # test reduction for strided input + m = dpt.ones(200, dtype=arg_dtype)[:1:-2] + r = dpt.sum(m) + assert dpt.all(r == 99) + + # test reduction for strided input which can be simplified + # to contiguous computation + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(dpt.flip(m)) + assert dpt.all(r == 100) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert dpt.all(r == 100) + + +def test_sum_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="u1") + y = dpt.sum(x) + assert y.shape == () + assert int(y) == 0 + + +def test_sum_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.sum(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.all(s 
== dpt.asarray(4 * 5 * 7, dtype="i4")) + + +def test_sum_keepdims(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.sum(m, axis=(1, 2, -1), keepdims=True) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype)) + + +def test_sum_scalar(): + get_queue_or_skip() + + m = dpt.ones(()) + s = dpt.sum(m) + + assert isinstance(s, dpt.usm_ndarray) + assert m.sycl_queue == s.sycl_queue + assert s.shape == () + assert s == dpt.full((), 1) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones((), dtype=arg_dtype) + r = dpt.sum(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert r == 1 + + +def test_sum_keepdims_zero_size(): + """See gh-1293""" + get_queue_or_skip() + n = 10 + a = dpt.ones((n, 0, n)) + + s1 = dpt.sum(a, keepdims=True) + assert s1.shape == (1, 1, 1) + + s2 = dpt.sum(a, axis=(0, 1), keepdims=True) + assert s2.shape == (1, 1, n) + + s3 = dpt.sum(a, axis=(1, 2), keepdims=True) + assert s3.shape == (n, 1, 1) + + s4 = dpt.sum(a, axis=(0, 2), keepdims=True) + assert s4.shape == (1, 0, 1) + + a0 = a[0] + s5 = dpt.sum(a0, keepdims=True) + assert s5.shape == (1, 1) + + +@pytest.mark.parametrize("arg_dtype", ["i8", "f4", "c8"]) +@pytest.mark.parametrize("n", [1023, 1024, 1025]) +def test_largish_reduction(arg_dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = 5 + x = dpt.ones((m, n, m), dtype=arg_dtype) + + y1 = dpt.sum(x, axis=(0, 1)) + y2 = dpt.sum(x, axis=(1, 2)) + + assert dpt.all(dpt.equal(y1, y2)) + assert dpt.all(dpt.equal(y1, n * m)) + + +@pytest.mark.parametrize("n", [1023, 1024, 1025]) +def test_largish_reduction_axis1_axis0(n): + get_queue_or_skip() + + m = 25 + x1 = dpt.ones((m, n), dtype="f4") + x2 = dpt.ones((n, m), dtype="f4") + + y1 = dpt.sum(x1, axis=1) + y2 = dpt.sum(x2, axis=0) + + assert dpt.all(y1 == n) + assert dpt.all(y2 == n) + + +def test_axis0_bug(): + "gh-1391" + get_queue_or_skip() + + sh = (1, 2, 3) + a = dpt.arange(sh[0] * sh[1] * sh[2], dtype="i4") + a = dpt.reshape(a, sh) + aT = dpt.permute_dims(a, (2, 1, 0)) + + s = dpt.sum(aT, axis=2) + expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) + + assert dpt.all(s == expected) + + +def test_sum_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + # The atomic case is checked in `test_usm_ndarray_reductions` + # This test checks the tree reduction path for correctness + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.sum(x, axis=0) + expected = dpt.asarray( + [ + [60, 63, 66, 69, 72], + [75, 78, 81, 84, 87], + [90, 93, 96, 99, 102], + [105, 108, 111, 114, 117], + ], + dtype="f4", + ) + tol = dpt.finfo(m.dtype).resolution + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + x = dpt.flip(x, axis=2) + m = dpt.sum(x, axis=2) + expected = dpt.asarray( + [[10, 35, 60, 85], [110, 135, 160, 185], [210, 235, 260, 285]], + dtype="f4", + ) + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:]) +def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + arg_dtype = dpt.dtype(arg_dtype) + + m = dpt.ones(100, 
dtype=arg_dtype) + r = dpt.prod(m) + + assert isinstance(r, dpt.usm_ndarray) + if m.dtype.kind == "i": + assert r.dtype.kind == "i" + elif m.dtype.kind == "u": + assert r.dtype.kind == "u" + elif m.dtype.kind == "f": + assert r.dtype.kind == "f" + elif m.dtype.kind == "c": + assert r.dtype.kind == "c" + assert dpt.all(r == 1) + + if dpt.isdtype(m.dtype, "unsigned integer"): + m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(512, dtype=r.dtype)) + else: + m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype)) + + +def test_prod_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="u1") + y = dpt.prod(x) + assert y.shape == () + assert int(y) == 1 + + +def test_prod_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.prod(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.all(s == dpt.asarray(1, dtype="i4")) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + out_dtype = dpt.dtype(out_dtype) + arg_dtype = dpt.dtype(arg_dtype) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert dpt.all(r == 1) + + +def test_gh_1468(): + "See https://github.com/IntelPython/dpctl/issues/1468" + get_queue_or_skip() + + a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32) + t = dpt.sum(a, dtype="f4") + assert t > 0 + + +@pytest.mark.parametrize( + "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"] +) +def test_gh_1944(dt): + "See https://github.com/IntelPython/dpctl/issues/1944" + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q) + r = dpt.sum(x, dtype="?") + # reduction must be performed in the requested dtype + # if performed in the input type, result is False + assert r diff --git a/dpnp/tests/tensor/test_tensor_testing.py b/dpnp/tests/tensor/test_tensor_testing.py new file mode 100644 index 000000000000..34cc40987354 --- /dev/null +++ b/dpnp/tests/tensor/test_tensor_testing.py @@ -0,0 +1,181 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import itertools
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+_all_dtypes = [
+    "?",
+    "i1",
+    "u1",
+    "i2",
+    "u2",
+    "i4",
+    "u4",
+    "i8",
+    "u8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_allclose(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    a1 = dpt.ones(10, dtype=dtype)
+    a2 = dpt.ones(10, dtype=dtype)
+
+    assert dpt.allclose(a1, a2)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
+def test_allclose_real_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+    a1 = dpt.asarray(v[2:], dtype=dtype)
+    a2 = dpt.asarray(v[2:], dtype=dtype)
+
+    tol = dpt.finfo(a1.dtype).resolution
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol)
+
+    a1 = dpt.asarray(v, dtype=dtype)
+    a2 = dpt.asarray(v, dtype=dtype)
+
+    assert not dpt.allclose(a1, a2, atol=tol, rtol=tol)
+    assert dpt.allclose(a1, a2, atol=tol, rtol=tol, equal_nan=True)
+
+
+@pytest.mark.parametrize("dtype", ["c8", "c16"])
+def test_allclose_complex_fp(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
+
+    not_nans = [complex(*xy) for xy in itertools.product(v[2:], repeat=2)]
+    z1 = dpt.asarray(not_nans, dtype=dtype)
+    z2 = dpt.asarray(not_nans, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol)
+
+    both = [complex(*xy) for xy in itertools.product(v, repeat=2)]
+    z1 = dpt.asarray(both, dtype=dtype)
+    z2 = dpt.asarray(both, dtype=dtype)
+
+    tol = dpt.finfo(z1.dtype).resolution
+    assert not dpt.allclose(z1, z2, atol=tol, rtol=tol)
+    assert dpt.allclose(z1, z2, atol=tol, rtol=tol, equal_nan=True)
+
+
+def test_allclose_validation():
+    with pytest.raises(TypeError):
+        dpt.allclose(True, False)
+
+    get_queue_or_skip()
+    x = dpt.asarray(True)
+    with pytest.raises(TypeError):
+        dpt.allclose(x, False)
+
+
+def test_allclose_type_promotion():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10, dtype="i4")
+    x2 = dpt.ones(10, dtype="i8")
+
+    assert dpt.allclose(x1, x2)
+
+
+def test_allclose_tolerance():
+    get_queue_or_skip()
+
+    x = dpt.zeros(10, dtype="f4")
+    atol = 1e-5
+    y = dpt.full_like(x, atol)
+    assert dpt.allclose(x, y, atol=atol, rtol=0)
+
+    # 2**-17, about 8e-6
+    tol = float.fromhex("0x1.0p-17")
+    x = dpt.ones(10, dtype="f4")
+    y = x - tol
+    assert dpt.allclose(x, y, atol=0, rtol=tol)
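+
+
+# Illustrative sketch, not exercised by the suite: the tolerance checks above
+# assume a NumPy-style elementwise criterion,
+# abs(x - y) <= atol + rtol * abs(y). A naive reference implementation under
+# that assumption would be:
+def _naive_allclose(x, y, rtol=1e-05, atol=1e-08):
+    # elementwise comparison reduced with dpt.all; no equal_nan handling
+    return bool(dpt.all(dpt.abs(x - y) <= atol + rtol * dpt.abs(y)))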
+
+
+def test_allclose_real_fp_early_exits():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="f4")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="f4")
+
+    # early exits, inf positions are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="f4")
+
+    # early exits, inf positions are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+
+def test_allclose_complex_fp_early_exits():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="c8")
+    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="c8")
+
+    # early exits, inf positions of real parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="c8")
+
+    # early exits, inf positions of real parts are the same, but signs differ
+    assert not dpt.allclose(x1, x2)
+
+    x1 = dpt.asarray([0.0, dpt.inf * 1j, -dpt.inf * 1j], dtype="c8")
+    x2 = dpt.asarray([dpt.inf * 1j, 0.0, -dpt.inf * 1j], dtype="c8")
+
+    # early exits, inf positions of imag parts are different
+    assert not dpt.allclose(x1, x2)
+
+    x2 = dpt.asarray([0.0, -dpt.inf * 1j, dpt.inf * 1j], dtype="c8")
+    assert not dpt.allclose(x1, x2)
diff --git a/dpnp/tests/tensor/test_usm_ndarray_ctor.py b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
new file mode 100644
index 000000000000..70066860b19f
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_ctor.py
@@ -0,0 +1,2324 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# ***************************************************************************** + +import ctypes +import numbers +from math import prod + +import dpctl +import dpctl.memory as dpm +import numpy as np +import pytest +from numpy.testing import assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor import Device + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "b1", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize( + "shape", + [ + (), + (4,), + (0,), + (0, 1), + (0, 0), + (4, 5), + (2, 5, 2), + (2, 2, 2, 2, 2, 2, 2, 2), + 5, + np.int32(7), + ], +) +@pytest.mark.parametrize("usm_type", ["shared", "host", "device"]) +def test_allocate_usm_ndarray(shape, usm_type): + q = get_queue_or_skip() + X = dpt.usm_ndarray( + shape, dtype="i8", buffer=usm_type, buffer_ctor_kwargs={"queue": q} + ) + Xnp = np.ndarray(shape, dtype="i8") + assert X.usm_type == usm_type + assert X.sycl_context == q.sycl_context + assert X.sycl_device == q.sycl_device + assert X.size == Xnp.size + assert X.shape == Xnp.shape + assert X.shape == X.__sycl_usm_array_interface__["shape"] + + +def test_usm_ndarray_flags(): + get_queue_or_skip() + f = dpt.usm_ndarray((5,), dtype="i4").flags + assert f.fc + assert f.forc + + f = dpt.usm_ndarray((5, 2), dtype="i4").flags + assert f.c_contiguous + assert f.forc + + f = dpt.usm_ndarray((5, 2), dtype="i4", order="F").flags + assert f.f_contiguous + assert f.forc + assert f.fnc + + f = dpt.usm_ndarray((5,), dtype="i4", strides=(1,)).flags + assert f.fc + assert f.forc + + f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(2, 0, 1)).flags + assert f.c_contiguous + assert f.forc + + f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(1, 0, 5)).flags + assert f.f_contiguous + assert f.forc + assert f.fnc + + f = dpt.usm_ndarray((5, 0, 1), dtype="i4", strides=(1, 0, 1)).flags + assert f.fc + assert f.forc + assert not dpt.usm_ndarray( + (5, 1, 1), dtype="i4", strides=(2, 0, 1) + ).flags.forc + + x = dpt.empty(5, dtype="u2") + assert x.flags.writable is True + x.flags.writable = False + assert x.flags.writable is False + with pytest.raises(ValueError): + x[:] = 0 + x.flags["W"] = True + assert x.flags.writable is True + x.flags["WRITABLE"] = True + assert x.flags.writable is True + x[:] = 0 + + with pytest.raises(TypeError): + x.flags.writable = {} + with pytest.raises(ValueError): + x.flags["C"] = False + + +def test_usm_ndarray_flags_bug_gh_1334(): + get_queue_or_skip() + a = dpt.ones((2, 3), dtype="u4") + r = dpt.reshape(a, (1, 6, 1)) + assert r.flags["C"] and r.flags["F"] + + a = dpt.ones((2, 3), dtype="u4", order="F") + r = dpt.reshape(a, (1, 6, 1), order="F") + assert r.flags["C"] and r.flags["F"] + + a = dpt.ones((2, 3, 4), dtype="i8") + r = dpt.sum(a, axis=(1, 2), keepdims=True) + assert r.flags["C"] and r.flags["F"] + + a = dpt.ones((2, 1), dtype="?") + r = a[:, 1::-1] + assert r.flags["F"] and r.flags["C"] + + +def test_usm_ndarray_writable_flag_views(): + get_queue_or_skip() + a = dpt.arange(10, dtype="f4") + a.flags["W"] = False + + a.shape = (5, 2) + assert not a.flags.writable + assert not a.T.flags.writable + assert not a.mT.flags.writable + assert not a.real.flags.writable + assert not a[0:3].flags.writable + + a = dpt.arange(10, dtype="c8") + a.flags["W"] = False + + assert not a.real.flags.writable + assert not a.imag.flags.writable + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", 
_all_dtypes) +def test_usm_ndarray_from_zero_sized_usm_ndarray(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + x1 = dpt.ones((0,), dtype=dt1, sycl_queue=q) + x2 = dpt.usm_ndarray(x1.shape, dtype=dt2, buffer=x1) + assert x2.dtype == dt2 + assert x2.sycl_queue == q + assert x2._pointer == x1._pointer + assert x2.shape == x1.shape + + +def test_usm_ndarray_from_usm_ndarray_readonly(): + get_queue_or_skip() + + x1 = dpt.arange(10, dtype="f4") + x1.flags["W"] = False + x2 = dpt.usm_ndarray(x1.shape, dtype="f4", buffer=x1) + assert not x2.flags.writable + + +@pytest.mark.parametrize( + "dtype", + _all_dtypes + + [ + b"float32", + dpt.dtype("d"), + np.half, + ], +) +def test_dtypes(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + Xusm = dpt.usm_ndarray((1,), dtype=dtype) + assert Xusm.itemsize == dpt.dtype(dtype).itemsize + expected_fmt = (dpt.dtype(dtype).str)[1:] + actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:] + assert expected_fmt == actual_fmt + + +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +@pytest.mark.parametrize("buffer_ctor_kwargs", [dict(), {"queue": None}]) +def test_default_dtype(usm_type, buffer_ctor_kwargs): + q = get_queue_or_skip() + dev = q.get_sycl_device() + if buffer_ctor_kwargs: + buffer_ctor_kwargs["queue"] = q + Xusm = dpt.usm_ndarray( + (1,), buffer=usm_type, buffer_ctor_kwargs=buffer_ctor_kwargs + ) + if dev.has_aspect_fp64: + expected_dtype = "f8" + else: + expected_dtype = "f4" + assert Xusm.itemsize == dpt.dtype(expected_dtype).itemsize + expected_fmt = (dpt.dtype(expected_dtype).str)[1:] + actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:] + assert expected_fmt == actual_fmt + + +@pytest.mark.parametrize( + "dtype", + [ + "", + ">f4", + "invalid", + 123, + np.dtype(">f4"), + np.dtype([("a", ">f4"), ("b", "i4")]), + ], +) +def test_dtypes_invalid(dtype): + with pytest.raises((TypeError, ValueError)): + dpt.usm_ndarray((1,), dtype=dtype) + + +@pytest.mark.parametrize("dt", ["f", "c8"]) +def test_properties(dt): + """ + Test that properties execute + """ + try: + X = dpt.usm_ndarray((3, 4, 5), dtype=dt) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert isinstance(X.sycl_queue, dpctl.SyclQueue) + assert isinstance(X.sycl_device, dpctl.SyclDevice) + assert isinstance(X.sycl_context, dpctl.SyclContext) + assert isinstance(X.dtype, dpt.dtype) + assert isinstance(X.__sycl_usm_array_interface__, dict) + assert isinstance(X.mT, dpt.usm_ndarray) + assert isinstance(X.imag, dpt.usm_ndarray) + assert isinstance(X.real, dpt.usm_ndarray) + assert isinstance(X.shape, tuple) + assert isinstance(X.strides, tuple) + assert X.usm_type in ("shared", "device", "host") + assert isinstance(X.size, numbers.Integral) + assert isinstance(X.nbytes, numbers.Integral) + assert isinstance(X.ndim, numbers.Integral) + assert isinstance(X._pointer, numbers.Integral) + assert isinstance(X.device, Device) + with pytest.raises(ValueError): + # array-API mandates exception for .ndim != 2 + X.T + Y = dpt.usm_ndarray((2, 3), dtype=dt) + assert isinstance(Y.mT, dpt.usm_ndarray) + V = dpt.usm_ndarray((3,), dtype=dt) + with pytest.raises(ValueError): + # array-API mandates exception for .ndim != 2 + V.mT + + +@pytest.mark.parametrize("shape", [tuple(), (1,), (1, 1), (1, 1, 1)]) +@pytest.mark.parametrize("dtype", ["|b1", "|u2", "|f4", "|i8"]) +class TestCopyScalar: + @pytest.mark.parametrize("func", [bool, float, int, 
complex]) + def test_copy_scalar_with_func(self, func, shape, dtype): + try: + X = dpt.usm_ndarray(shape, dtype=dtype) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = np.arange(1, X.size + 1, dtype=dtype) + X.usm_data.copy_from_host(Y.view("|u1")) + Y = Y.reshape(()) + # Non-0D numeric arrays must not be convertible to Python scalars + if len(shape) != 0: + assert_raises_regex(TypeError, "only 0-dimensional arrays", func, X) + else: + # 0D arrays are allowed to convert + assert func(X) == func(Y) + + @pytest.mark.parametrize( + "method", ["__bool__", "__float__", "__int__", "__complex__"] + ) + def test_copy_scalar_with_method(self, method, shape, dtype): + try: + X = dpt.usm_ndarray(shape, dtype=dtype) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = np.arange(1, X.size + 1, dtype=dtype) + X.usm_data.copy_from_host(Y.view("|u1")) + Y = Y.reshape(()) + if len(shape) != 0: + assert_raises_regex( + TypeError, "only 0-dimensional arrays", getattr(X, method) + ) + else: + assert getattr(X, method)() == getattr(Y, method)() + + +@pytest.mark.parametrize("func", [bool, float, int, complex]) +@pytest.mark.parametrize("shape", [(2,), (1, 2), (3, 4, 5), (0,)]) +def test_copy_scalar_invalid_shape(func, shape): + try: + X = dpt.usm_ndarray(shape, dtype="i8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + func(X) + + +def test_index_noninteger(): + import operator + + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(IndexError): + operator.index(X) + + +@pytest.mark.parametrize( + "ind", + [ + tuple(), + (None,), + ( + None, + Ellipsis, + None, + ), + (2, 2, None, 3, 4), + (Ellipsis,), + (None, slice(0, None, 2), Ellipsis, slice(0, None, 3)), + (None, slice(1, None, 2), Ellipsis, slice(1, None, 3)), + (None, slice(None, -1, -2), Ellipsis, slice(2, None, 3)), + ( + slice(None, None, -1), + slice(None, None, -1), + slice(0, None, 3), + slice(1, None, 2), + ), + ], +) +def test_basic_slice(ind): + try: + X = dpt.usm_ndarray((2 * 3, 2 * 4, 3 * 5, 2 * 7), dtype="u1") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xnp = np.empty(X.shape, dtype=X.dtype) + S = X[ind] + Snp = Xnp[ind] + assert S.shape == Snp.shape + assert S.strides == Snp.strides + assert S.dtype == X.dtype + + +def test_empty_slice(): + # see gh801 + try: + X = dpt.empty((1, 0, 1), dtype="u1") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Y = X[:, ::-1, :] + assert Y.shape == X.shape + Z = X[:, ::2, :] + assert Z.shape == X.shape + X = dpt.empty(0) + Y = X[::-1] + assert Y.shape == X.shape + Z = X[::2] + assert Z.shape == X.shape + X = dpt.empty((0, 4), dtype="u1") + assert X[:, 1].shape == (0,) + assert X[:, 1:3].shape == (0, 2) + + +def test_slice_constructor_1d(): + Xh = np.arange(37, dtype="i4") + try: + Xusm = dpt.arange(Xh.size, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for ind in [ + slice(1, None, 2), + slice(0, None, 3), + slice(1, None, 3), + slice(2, None, 3), + slice(None, None, -1), + slice(-2, 2, -2), + slice(-1, 1, -2), + slice(None, None, -13), + ]: + assert np.array_equal( + dpt.asnumpy(Xusm[ind]), Xh[ind] + ), "Failed for {}".format(ind) + + +def test_slice_constructor_3d(): + Xh = np.ones((37, 24, 35), dtype="i4") + try: + Xusm = dpt.ones(Xh.shape, 
dtype=Xh.dtype) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for ind in [ + slice(1, None, 2), + slice(0, None, 3), + slice(1, None, 3), + slice(2, None, 3), + slice(None, None, -1), + slice(-2, 2, -2), + slice(-1, 1, -2), + slice(None, None, -13), + (slice(None, None, -2), Ellipsis, None, 15), + ]: + assert np.array_equal( + dpt.to_numpy(Xusm[ind]), Xh[ind] + ), "Failed for {}".format(ind) + + +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_slice_suai(usm_type): + Xh = np.arange(0, 10, dtype="u1") + try: + Xusm = dpt.arange(0, 10, dtype="u1", usm_type=usm_type) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for ind in [slice(2, 3, None), slice(5, 7, None), slice(3, 9, None)]: + assert np.array_equal( + dpm.as_usm_memory(Xusm[ind]).copy_to_host(), Xh[ind] + ), "Failed for {}".format(ind) + + +def test_slicing_basic(): + try: + Xusm = dpt.usm_ndarray((10, 5), dtype="c8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xusm[None] + Xusm[...] + Xusm[8] + Xusm[-3] + with pytest.raises(IndexError): + Xusm[..., ...] + with pytest.raises(IndexError): + Xusm[1, 1, :, 1] + Xusm[:, -4] + with pytest.raises(IndexError): + Xusm[:, -128] + with pytest.raises(IndexError): + Xusm[{1, 2, 3, 4, 5, 6, 7}] + X = dpt.usm_ndarray(10, "u1") + X.usm_data.copy_from_host(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09") + int( + X[X[2]] + ) # check that objects with __index__ method can be used as indices + Xh = dpm.as_usm_memory(X[X[2] : X[5]]).copy_to_host() + Xnp = np.arange(0, 10, dtype="u1") + assert np.array_equal(Xh, Xnp[Xnp[2] : Xnp[5]]) + + +def test_slicing_empty(): + try: + X = dpt.usm_ndarray((0, 10), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + x = dpt.moveaxis(X, 1, 0) + # this used to raise ValueError + y = x[1] + assert y.ndim == 1 + assert y.shape == (0,) + assert y.dtype == X.dtype + assert y.usm_type == X.usm_type + assert y.sycl_queue == X.sycl_queue + w = x[1:3] + assert w.ndim == 2 + assert w.shape == ( + 2, + 0, + ) + assert w.dtype == X.dtype + assert w.usm_type == X.usm_type + assert w.sycl_queue == X.sycl_queue + + +def test_ctor_invalid_shape(): + with pytest.raises(TypeError): + dpt.usm_ndarray(dict()) + + +def test_ctor_invalid_order(): + get_queue_or_skip() + with pytest.raises(ValueError): + dpt.usm_ndarray((5, 5, 3), order="Z") + with pytest.raises(ValueError): + dpt.usm_ndarray((10), strides=(1,), order="Z") + with pytest.raises(ValueError): + dpt.usm_ndarray((), order="Z") + + +def test_ctor_buffer_kwarg(): + try: + dpt.usm_ndarray(10, dtype="i8", buffer=b"device") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.usm_ndarray(10, buffer="invalid_param") + Xusm = dpt.usm_ndarray((10, 5), dtype="c8") + Xusm[...] 
= 1 + X2 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm, dtype=Xusm.dtype) + Horig_copy = Xusm.usm_data.copy_to_host() + H2_copy = X2.usm_data.copy_to_host() + assert np.array_equal(Horig_copy, H2_copy) + with pytest.raises(ValueError): + dpt.usm_ndarray(10, dtype="i4", buffer=dict()) + # use device-specific default fp data type + X3 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm) + assert np.array_equal(Horig_copy, X3.usm_data.copy_to_host()) + + +def test_usm_ndarray_props(): + try: + Xusm = dpt.usm_ndarray((10, 5), dtype="c8", order="F") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xusm.ndim + repr(Xusm) + Xusm.flags + Xusm.__sycl_usm_array_interface__ + Xusm.device + Xusm.strides + Xusm.real + Xusm.imag + try: + dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("Sycl device CPU was not detected") + Xusm.to_device("cpu") + + +def test_datapi_device(): + try: + X = dpt.usm_ndarray(1, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + dev_t = type(X.device) + with pytest.raises(TypeError): + dev_t() + dev_t.create_device(X.device) + dev_t.create_device(X.sycl_queue) + d1 = dev_t.create_device(X.sycl_device) + d2 = dev_t.create_device(X.sycl_device.filter_string) + d3 = dev_t.create_device(None) + assert d1.sycl_queue == d2.sycl_queue + assert d1.sycl_queue == d3.sycl_queue + X.device.sycl_context + X.device.sycl_queue + X.device.sycl_device + repr(X.device) + X.device.print_device_info() + + +def _pyx_capi_int(X, pyx_capi_name, caps_name=b"int", val_restype=ctypes.c_int): + import sys + + mod = sys.modules[X.__class__.__module__] + cap = mod.__pyx_capi__.get(pyx_capi_name, None) + if cap is None: + raise ValueError( + "__pyx_capi__ does not export {} capsule".format(pyx_capi_name) + ) + # construct Python callable to invoke these functions + cap_ptr_fn = ctypes.pythonapi.PyCapsule_GetPointer + cap_ptr_fn.restype = ctypes.c_void_p + cap_ptr_fn.argtypes = [ctypes.py_object, ctypes.c_char_p] + cap_ptr = cap_ptr_fn(cap, caps_name) + val_ptr = ctypes.cast(cap_ptr, ctypes.POINTER(val_restype)) + return val_ptr.contents.value + + +def test_pyx_capi_check_constants(): + try: + X = dpt.usm_ndarray(17, dtype="i1")[1::2] + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + cc_flag = _pyx_capi_int(X, "USM_ARRAY_C_CONTIGUOUS") + assert cc_flag > 0 and 0 == (cc_flag & (cc_flag - 1)) + fc_flag = _pyx_capi_int(X, "USM_ARRAY_F_CONTIGUOUS") + assert fc_flag > 0 and 0 == (fc_flag & (fc_flag - 1)) + w_flag = _pyx_capi_int(X, "USM_ARRAY_WRITABLE") + assert w_flag > 0 and 0 == (w_flag & (w_flag - 1)) + + bool_typenum = _pyx_capi_int(X, "UAR_BOOL") + assert bool_typenum == dpt.dtype("bool_").num + + byte_typenum = _pyx_capi_int(X, "UAR_BYTE") + assert byte_typenum == dpt.dtype(np.byte).num + ubyte_typenum = _pyx_capi_int(X, "UAR_UBYTE") + assert ubyte_typenum == dpt.dtype(np.ubyte).num + + short_typenum = _pyx_capi_int(X, "UAR_SHORT") + assert short_typenum == dpt.dtype(np.short).num + ushort_typenum = _pyx_capi_int(X, "UAR_USHORT") + assert ushort_typenum == dpt.dtype(np.ushort).num + + int_typenum = _pyx_capi_int(X, "UAR_INT") + assert int_typenum == dpt.dtype(np.intc).num + uint_typenum = _pyx_capi_int(X, "UAR_UINT") + assert uint_typenum == dpt.dtype(np.uintc).num + + long_typenum = _pyx_capi_int(X, "UAR_LONG") + assert long_typenum == dpt.dtype("l").num + ulong_typenum = _pyx_capi_int(X, "UAR_ULONG") + assert ulong_typenum == dpt.dtype("L").num + + longlong_typenum = 
_pyx_capi_int(X, "UAR_LONGLONG") + assert longlong_typenum == dpt.dtype(np.longlong).num + ulonglong_typenum = _pyx_capi_int(X, "UAR_ULONGLONG") + assert ulonglong_typenum == dpt.dtype(np.ulonglong).num + + half_typenum = _pyx_capi_int(X, "UAR_HALF") + assert half_typenum == dpt.dtype(np.half).num + float_typenum = _pyx_capi_int(X, "UAR_FLOAT") + assert float_typenum == dpt.dtype(np.single).num + double_typenum = _pyx_capi_int(X, "UAR_DOUBLE") + assert double_typenum == dpt.dtype(np.double).num + + cfloat_typenum = _pyx_capi_int(X, "UAR_CFLOAT") + assert cfloat_typenum == dpt.dtype(np.csingle).num + cdouble_typenum = _pyx_capi_int(X, "UAR_CDOUBLE") + assert cdouble_typenum == dpt.dtype(np.cdouble).num + + +@pytest.mark.parametrize( + "shape", [tuple(), (1,), (5,), (2, 3), (2, 3, 4), (2, 2, 2, 2, 2)] +) +@pytest.mark.parametrize( + "dtype", + _all_dtypes, +) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_tofrom_numpy(shape, dtype, usm_type): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + Xusm = dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q) + Ynp = np.ones(shape, dtype=dtype) + Ynp[(0,) * len(shape)] = 0 + ind = (slice(None, None, None),) * Ynp.ndim + Xusm[ind] = Ynp + assert np.array_equal(dpt.to_numpy(Xusm), Ynp) + + +@pytest.mark.parametrize( + "dtype", + _all_dtypes, +) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_tofrom_numpy_permuted(dtype, usm_type): + shape = (3, 5, 7) + perm = (1, 2, 0) + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + Xusm = dpt.permute_dims( + dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q), perm + ) + Ynp = np.transpose(np.ones(shape, dtype=dtype), perm) + Ynp[:, ::2, ::2] = 0 + ind = (slice(None, None, None),) * Ynp.ndim + # even though Xusm and Ynp are strided, simple memcpy could be done. 
+    # This test validates that it is being done correctly
+    Xusm[ind] = Ynp
+    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+@pytest.mark.parametrize("src_usm_type", ["device", "shared", "host"])
+@pytest.mark.parametrize("dst_usm_type", ["device", "shared", "host"])
+def test_setitem_same_dtype(dtype, src_usm_type, dst_usm_type):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    shape = (2, 4, 3)
+    Xnp = (
+        np.random.randint(-10, 10, size=prod(shape))
+        .astype(dtype)
+        .reshape(shape)
+    )
+    X = dpt.from_numpy(Xnp, usm_type=src_usm_type)
+    Z = dpt.zeros(shape, dtype=dtype, usm_type=dst_usm_type)
+    Zusm_0d = dpt.copy(Z[0, 0, 0])
+    ind = (-1, -1, -1)
+    Xusm_0d = X[ind]
+    Zusm_0d[Ellipsis] = Xusm_0d
+    assert np.array_equal(dpt.to_numpy(Zusm_0d), Xnp[ind])
+    Zusm_1d = dpt.copy(Z[0, 1:3, 0])
+    ind = (-1, slice(0, 2, None), -1)
+    Xusm_1d = X[ind]
+    Zusm_1d[Ellipsis] = Xusm_1d
+    assert np.array_equal(dpt.to_numpy(Zusm_1d), Xnp[ind])
+    Zusm_2d = dpt.copy(Z[:, 1:3, 0])[::-1]
+    Xusm_2d = X[:, 1:4, -1]
+    Zusm_2d[:] = Xusm_2d[:, 0:2]
+    assert np.array_equal(dpt.to_numpy(Zusm_2d), Xnp[:, 1:3, -1])
+    Zusm_3d = dpt.copy(Z)
+    Xusm_3d = X
+    Zusm_3d[:] = Xusm_3d
+    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
+    Zusm_3d[::-1] = Xusm_3d[::-1]
+    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
+    Zusm_3d[:] = Xusm_3d[0]
+    R1 = dpt.to_numpy(Zusm_3d)
+    R2 = np.broadcast_to(Xnp[0], R1.shape)
+    assert R1.shape == R2.shape
+    assert np.allclose(R1, R2)
+    Zusm_empty = Zusm_1d[0:0]
+    Zusm_empty[Ellipsis] = Zusm_3d[0, 0, 0:0]
+
+
+def test_setitem_broadcasting():
+    "See gh-1503"
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="u4")
+    src = dpt.zeros((3, 1), dtype=dst.dtype)
+    dst[...] = src
+    expected = np.zeros(dst.shape, dtype=dst.dtype)
+    assert np.array_equal(dpt.asnumpy(dst), expected)
+
+
+def test_setitem_broadcasting_offset():
+    get_queue_or_skip()
+    dt = dpt.int32
+    x = dpt.asarray([[1, 2, 3], [6, 7, 8]], dtype=dt)
+    y = dpt.asarray([4, 5], dtype=dt)
+    x[0] = y[1]
+    expected = dpt.asarray([[5, 5, 5], [6, 7, 8]], dtype=dt)
+    assert dpt.all(x == expected)
+
+
+def test_setitem_broadcasting_empty_dst_validation():
+    "Broadcasting rules apply to empty arrays too, hence the exception"
+    get_queue_or_skip()
+    dst = dpt.ones((2, 0, 5, 4), dtype="i8")
+    src = dpt.ones((2, 0, 3, 4), dtype="i8")
+    with pytest.raises(ValueError):
+        dst[...] = src
+
+
+def test_setitem_broadcasting_empty_dst_edge_case():
+    """RHS is shrunk to an empty array by the
+    broadcasting rule, hence no exception"""
+    get_queue_or_skip()
+    dst = dpt.ones(1, dtype="i8")[0:0]
+    src = dpt.ones(tuple(), dtype="i8")
+    dst[...] = src
+
+
+def test_setitem_broadcasting_src_ndim_equal_dst_ndim():
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="i4")
+    src = dpt.zeros((2, 1, 4), dtype="i4")
+    dst[...] = src
+
+    expected = np.zeros(dst.shape, dtype=dst.dtype)
+    assert np.array_equal(dpt.asnumpy(dst), expected)
+
+
+def test_setitem_broadcasting_src_ndim_greater_than_dst_ndim():
+    get_queue_or_skip()
+    dst = dpt.ones((2, 3, 4), dtype="i4")
+    src = dpt.zeros((1, 2, 1, 4), dtype="i4")
+    dst[...] 
= src + + expected = np.zeros(dst.shape, dtype=dst.dtype) + assert np.array_equal(dpt.asnumpy(dst), expected) + + +@pytest.mark.parametrize( + "dtype", + _all_dtypes, +) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_setitem_scalar(dtype, usm_type): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.usm_ndarray((6, 6), dtype=dtype, buffer=usm_type) + for i in range(X.size): + X[np.unravel_index(i, X.shape)] = np.asarray(i, dtype=dtype) + assert np.array_equal( + dpt.to_numpy(X), np.arange(X.size).astype(dtype).reshape(X.shape) + ) + Y = dpt.usm_ndarray((2, 3), dtype=dtype, buffer=usm_type) + for i in range(Y.size): + Y[np.unravel_index(i, Y.shape)] = i + assert np.array_equal( + dpt.to_numpy(Y), np.arange(Y.size).astype(dtype).reshape(Y.shape) + ) + + +def test_setitem_errors(): + q = get_queue_or_skip() + X = dpt.empty((4,), dtype="u1", sycl_queue=q) + Y = dpt.empty((4, 2), dtype="u1", sycl_queue=q) + with pytest.raises(ValueError): + X[:] = Y + with pytest.raises(ValueError): + X[:] = Y[:, 0:1] + X[:] = Y[None, :, 0] + + +@pytest.mark.parametrize("src_dt,dst_dt", [("i4", "i8"), ("f4", "f8")]) +def test_setitem_different_dtypes(src_dt, dst_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dst_dt, q) + X = dpt.ones(10, dtype=src_dt, sycl_queue=q) + Y = dpt.zeros(10, dtype=src_dt, sycl_queue=q) + Z = dpt.empty((20,), dtype=dst_dt, sycl_queue=q) + Z[::2] = X + Z[1::2] = Y + assert np.allclose(dpt.asnumpy(Z), np.tile(np.array([1, 0], Z.dtype), 10)) + + +def test_setitem_wingaps(): + q = get_queue_or_skip() + if dpt.dtype("intc").itemsize == dpt.dtype("int32").itemsize: + dpt_dst = dpt.empty(4, dtype="int32", sycl_queue=q) + np_src = np.arange(4, dtype="intc") + dpt_dst[:] = np_src # should not raise exceptions + assert np.array_equal(dpt.asnumpy(dpt_dst), np_src) + if dpt.dtype("long").itemsize == dpt.dtype("longlong").itemsize: + dpt_dst = dpt.empty(4, dtype="longlong", sycl_queue=q) + np_src = np.arange(4, dtype="long") + dpt_dst[:] = np_src # should not raise exceptions + assert np.array_equal(dpt.asnumpy(dpt_dst), np_src) + + +def test_shape_setter(): + def cc_strides(sh): + return np.empty(sh, dtype="u1").strides + + def relaxed_strides_equal(st1, st2, sh): + eq_ = True + for s1, s2, d in zip(st1, st2, sh): + eq_ = eq_ and ((d == 1) or (s1 == s2)) + return eq_ + + sh_s = (2 * 3 * 4 * 5,) + sh_f = ( + 2, + 3, + 4, + 5, + ) + try: + X = dpt.usm_ndarray(sh_s, dtype="i8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X.shape = sh_f + assert X.shape == sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + assert X.flags.c_contiguous, "reshaped array expected to be C-contiguous" + + sh_s = ( + 2, + 12, + 5, + ) + sh_f = ( + 2, + 3, + 4, + 5, + ) + X = dpt.usm_ndarray(sh_s, dtype="u4", order="C") + X.shape = sh_f + assert X.shape == sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + + sh_s = (2, 3, 4, 5) + sh_f = (4, 3, 2, 5) + X = dpt.usm_ndarray(sh_s, dtype="f4") + X.shape = sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + + sh_s = (2, 3, 4, 5) + sh_f = (4, 3, 1, 2, 5) + X = dpt.usm_ndarray(sh_s, dtype="?") + X.shape = sh_f + assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f) + sz = X.size + X.shape = sz + assert X.shape == (sz,) + assert relaxed_strides_equal(X.strides, (1,), (sz,)) + + X = dpt.usm_ndarray(sh_s, dtype="u4") + with pytest.raises(TypeError): + X.shape = "abcbe" + X = dpt.usm_ndarray((4, 4), 
dtype="u1")[::2, ::2] + with pytest.raises(AttributeError): + X.shape = (4,) + X = dpt.usm_ndarray((0,), dtype="i4") + X.shape = (0,) + X.shape = ( + 2, + 0, + ) + X.shape = ( + 0, + 2, + ) + X.shape = ( + 1, + 0, + 1, + ) + + +def test_len(): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert len(X) == 1 + X = dpt.usm_ndarray((2, 1), "i4") + assert len(X) == 2 + X = dpt.usm_ndarray(tuple(), "i4") + with pytest.raises(TypeError): + len(X) + + +def test_array_namespace(): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X.__array_namespace__() + X._set_namespace(dpt) + assert X.__array_namespace__() is dpt + X.__array_namespace__(api_version=dpt.__array_api_version__) + assert X.__array_namespace__() is dpt + + +def test_dlpack(): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X.__dlpack_device__() + X.__dlpack__(stream=None) + + +def test_to_device(): + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + for dev in dpctl.get_devices(): + if dev.default_selector_score > 0: + Y = X.to_device(dev) + assert Y.sycl_device == dev + + +def test_to_device_stream_validation(): + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + # invalid type of stream keyword + with pytest.raises(TypeError): + X.to_device(X.sycl_queue, stream=dict()) + # stream is keyword-only arg + with pytest.raises(TypeError): + X.to_device(X.sycl_queue, X.sycl_queue) + + +def test_to_device_stream_use(): + try: + X = dpt.usm_ndarray(1, "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + q1 = dpctl.SyclQueue( + X.sycl_context, X.sycl_device, property="enable_profiling" + ) + X.to_device(q1, stream=q1) + + +def test_to_device_migration(): + q1 = get_queue_or_skip() # two distinct copies of default-constructed queue + q2 = get_queue_or_skip() + X1 = dpt.empty((5,), dtype="i8", sycl_queue=q1) # X1 is associated with q1 + X2 = X1.to_device(q2) # X2 is reassociated with q2 + assert X1.sycl_queue == q1 + assert X2.sycl_queue == q2 + assert X1.usm_data._pointer == X2.usm_data._pointer + + +def test_astype(): + try: + X = dpt.empty((5, 5), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X[:] = np.full((5, 5), 7, dtype="i4") + Y = dpt.astype(X, "c8", order="C") + assert np.allclose(dpt.to_numpy(Y), np.full((5, 5), 7, dtype="c8")) + if Y.sycl_device.has_aspect_fp16: + Y = dpt.astype(X[::2, ::-1], "f2", order="K") + assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f2")) + Y = dpt.astype(X[::2, ::-1], "f4", order="K") + assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f4")) + Y = dpt.astype(X[::2, ::-1], "i4", order="K", copy=False) + assert Y.usm_data is X.usm_data + Y = dpt.astype(X, None, order="K") + if X.sycl_queue.sycl_device.has_aspect_fp64: + assert Y.dtype is dpt.float64 + else: + assert Y.dtype is dpt.float32 + + +def test_astype_invalid_order(): + try: + X = dpt.usm_ndarray(5, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.astype(X, "i4", order="WRONG") + + +def test_astype_device(): + get_queue_or_skip() + q1 = dpctl.SyclQueue() + q2 = dpctl.SyclQueue() + + x = 
dpt.arange(5, dtype="i4", sycl_queue=q1) + r = dpt.astype(x, "f4") + assert r.sycl_queue == x.sycl_queue + assert r.sycl_device == x.sycl_device + + r = dpt.astype(x, "f4", device=q2) + assert r.sycl_queue == q2 + + +def test_astype_gh_1926(): + get_queue_or_skip() + + x = dpt.ones(64) + x_ = dpt.astype(x, x.dtype, copy=False, order="C") + assert x is x_ + + x__ = dpt.astype(x, x.dtype, copy=False, order="F") + assert x is x__ + + +def test_astype_gh_2121(): + get_queue_or_skip() + + x_np = np.asarray([0, 3, 1, 2, 0, 1], dtype="u1").view("?") + x = dpt.asarray(x_np) + res = dpt.astype(x, dpt.uint8) + expected = dpt.asarray([0, 1, 1, 1, 0, 1], dtype="u1") + assert dpt.all(res == expected) + + +def test_copy(): + try: + X = dpt.usm_ndarray((5, 5), "i4")[2:4, 1:4] + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X[:] = 42 + Yc = dpt.copy(X, order="C") + Yf = dpt.copy(X, order="F") + Ya = dpt.copy(X, order="A") + Yk = dpt.copy(X, order="K") + assert Yc.usm_data is not X.usm_data + assert Yf.usm_data is not X.usm_data + assert Ya.usm_data is not X.usm_data + assert Yk.usm_data is not X.usm_data + assert Yc.strides == (3, 1) + assert Yf.strides == (1, 2) + assert Ya.strides == (3, 1) + assert Yk.strides == (3, 1) + ref = np.full(X.shape, 42, dtype=X.dtype) + assert np.array_equal(dpt.asnumpy(Yc), ref) + assert np.array_equal(dpt.asnumpy(Yf), ref) + assert np.array_equal(dpt.asnumpy(Ya), ref) + assert np.array_equal(dpt.asnumpy(Yk), ref) + + +def test_copy_unaligned(): + get_queue_or_skip() + + x = dpt.ones(513, dtype="i4") + r = dpt.astype(x[1:], "f4") + + assert dpt.all(r == 1) + + +def test_ctor_invalid(): + try: + m = dpm.MemoryUSMShared(12) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.usm_ndarray((4,), dtype="i4", buffer=m) + m = dpm.MemoryUSMShared(64) + with pytest.raises(ValueError): + dpt.usm_ndarray((4,), dtype="u1", buffer=m, strides={"not": "valid"}) + + +def test_reshape(): + try: + X = dpt.usm_ndarray((5, 5), "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + # can be done as views + Y = dpt.reshape(X, (25,)) + assert Y.shape == (25,) + Z = X[::2, ::2] + # requires a copy + W = dpt.reshape(Z, (Z.size,), order="F") + assert W.shape == (Z.size,) + with pytest.raises(TypeError): + dpt.reshape("invalid") + with pytest.raises(ValueError): + dpt.reshape(Z, (2, 2, 2, 2, 2)) + with pytest.raises(ValueError): + dpt.reshape(Z, Z.shape, order="invalid") + W = dpt.reshape(Z, (-1,), order="C") + assert W.shape == (Z.size,) + + X = dpt.usm_ndarray((1,), dtype="i8") + Y = dpt.reshape(X, X.shape) + assert Y.flags == X.flags + + A = dpt.usm_ndarray((0,), "i4") + A1 = dpt.reshape(A, (0,)) + assert A1.shape == (0,) + requested_shape = ( + 2, + 0, + ) + A2 = dpt.reshape(A, requested_shape) + assert A2.shape == requested_shape + requested_shape = ( + 0, + 2, + ) + A3 = dpt.reshape(A, requested_shape) + assert A3.shape == requested_shape + requested_shape = ( + 1, + 0, + 2, + ) + A4 = dpt.reshape(A, requested_shape) + assert A4.shape == requested_shape + + +def test_reshape_orderF(): + try: + a = dpt.arange(6 * 3 * 4, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + b = dpt.reshape(a, (6, 2, 6)) + c = dpt.reshape(b, (9, 8), order="F") + assert c.flags.f_contiguous + assert c._pointer != b._pointer + assert b._pointer == a._pointer + + a_np = np.arange(6 * 3 * 4, dtype="i4") + b_np = np.reshape(a_np, (6, 2, 
6)) + c_np = np.reshape(b_np, (9, 8), order="F") + assert np.array_equal(c_np, dpt.asnumpy(c)) + + +def test_reshape_noop(): + """Per gh-1664""" + try: + a = dpt.ones((2, 1)) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + b = dpt.reshape(a, (2, 1)) + assert b is a + + +def test_reshape_zero_size(): + try: + a = dpt.empty((0,)) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.reshape(a, (-1, 0)) + + +def test_reshape_large_ndim(): + ndim = 32 + idx = tuple(1 if i + 1 < ndim else ndim for i in range(ndim)) + try: + d = dpt.ones(ndim, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + d = dpt.reshape(d, idx) + assert d.shape == idx + + +def test_reshape_copy_kwrd(): + try: + X = dpt.usm_ndarray((2, 3), "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + new_shape = (6,) + Z = dpt.reshape(X, new_shape, copy=True) + assert Z.shape == new_shape + assert Z.usm_data is not X.usm_data + X = dpt.usm_ndarray((3, 3), "i4")[::2, ::2] + new_shape = (4,) + with pytest.raises(ValueError): + Z = dpt.reshape(X, new_shape, copy=False) + with pytest.raises(ValueError): + invalid = Ellipsis + Z = dpt.reshape(X, new_shape, copy=invalid) + + +def test_transpose(): + n, m = 2, 3 + try: + X = dpt.usm_ndarray((n, m), "f4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xnp = np.arange(n * m, dtype="f4").reshape((n, m)) + X[:] = Xnp + assert np.array_equal(dpt.to_numpy(X.T), Xnp.T) + assert np.array_equal(dpt.to_numpy(X[1:].T), Xnp[1:].T) + + +def test_real_imag_views(): + n, m = 2, 3 + try: + X = dpt.usm_ndarray((n, m), "c8") + X_scalar = dpt.usm_ndarray((), dtype="c8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + Xnp_r = np.arange(n * m, dtype="f4").reshape((n, m)) + Xnp_i = np.arange(n * m, 2 * n * m, dtype="f4").reshape((n, m)) + Xnp = Xnp_r + 1j * Xnp_i + X[:] = Xnp + X_real = X.real + X_imag = X.imag + assert np.array_equal(dpt.to_numpy(X_real), Xnp.real) + assert np.array_equal(dpt.to_numpy(X.imag), Xnp.imag) + assert not X_real.flags["C"] and not X_real.flags["F"] + assert not X_imag.flags["C"] and not X_imag.flags["F"] + assert X_real.strides == X_imag.strides + assert np.array_equal(dpt.to_numpy(X[1:].real), Xnp[1:].real) + assert np.array_equal(dpt.to_numpy(X[1:].imag), Xnp[1:].imag) + + X_scalar[...] 
= complex(n * m, 2 * n * m)
+    assert X_scalar.real and X_scalar.imag
+
+    # check that _zero_like works for scalars
+    X_scalar = dpt.usm_ndarray((), dtype="f4")
+    assert isinstance(X_scalar.imag, dpt.usm_ndarray)
+    assert not X_scalar.imag
+    assert X_scalar.real.sycl_queue == X_scalar.imag.sycl_queue
+
+
+def test_real_imag_views_fp16():
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dpt.float16, q)
+
+    X = dpt.usm_ndarray(
+        (3, 4), dtype=dpt.float16, buffer_ctor_kwargs={"queue": q}
+    )
+    assert isinstance(X.real, dpt.usm_ndarray) and isinstance(
+        X.imag, dpt.usm_ndarray
+    )
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_zeros(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros(10, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.zeros(10, dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_ones(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones(10, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.ones(10, dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    _all_dtypes,
+)
+def test_full(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.full(10, 4, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype))
+
+
+def test_full_cmplx128():
+    q = get_queue_or_skip()
+    dtype = "c16"
+    skip_if_dtype_not_supported(dtype, q)
+    fill_v = 1 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 0j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+
+
+def test_full_dtype_inference():
+    try:
+        X = dpt.full(10, 4)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert np.issubdtype(X.dtype, np.integer)
+    try:
+        X = dpt.full(10, True)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert X.dtype is dpt.dtype(np.bool_)
+    assert np.issubdtype(dpt.full(10, 12.3).dtype, np.floating)
+    try:
+        X = dpt.full(10, 0.3 - 2j)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    cdt = X.dtype
+    assert np.issubdtype(cdt, np.complexfloating)
+
+    assert np.issubdtype(dpt.full(10, 12.3, dtype=int).dtype, np.integer)
+    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=int).dtype, np.integer)
+    rdt = np.finfo(cdt).dtype
+    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=rdt).dtype, np.floating)
+
+
+@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
+def test_full_special_fp(dt):
+    """See gh-1314"""
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    ar = dpt.full(10, fill_value=dpt.nan, dtype=dt)
+    err_msg = f"Failed for fill_value=dpt.nan and dtype {dt}"
+    assert dpt.isnan(ar[0]), err_msg
+
+    ar = dpt.full(10, fill_value=dpt.inf, dtype=dt)
+    err_msg = f"Failed for fill_value=dpt.inf and dtype {dt}"
+    assert dpt.isinf(ar[0]) and dpt.greater(ar[0], 0), err_msg
+
+    ar = dpt.full(10, fill_value=-dpt.inf, dtype=dt)
+    err_msg = f"Failed for fill_value=-dpt.inf and dtype {dt}"
+    assert dpt.isinf(ar[0]) and dpt.less(ar[0], 0), err_msg
+
+    ar = dpt.full(10, fill_value=dpt.pi, dtype=dt)
+    err_msg = f"Failed for fill_value=dpt.pi and dtype {dt}"
+    check = abs(float(ar[0]) - dpt.pi) < 16 * dpt.finfo(ar.dtype).eps
+    assert check, err_msg
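+
+
+# Worked bound for the dpt.pi check above (illustrative): for "f2" the
+# nearest half-precision value to pi is 3.140625, and
+# abs(3.140625 - pi) ~= 9.7e-4 < 16 * finfo("f2").eps == 1.5625e-2,
+# so a fill value rounded to the target precision still passes.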
+
+
+def test_full_fill_array():
+    q = get_queue_or_skip()
+
+    Xnp = np.array([1, 2, 3], dtype="i4")
+    X = dpt.asarray(Xnp, sycl_queue=q)
+
+    shape = (3, 3)
+    Y = dpt.full(shape, X)
+    Ynp = np.full(shape, Xnp)
+
+    assert Y.dtype == Ynp.dtype
+    assert Y.usm_type == "device"
+    assert np.array_equal(dpt.asnumpy(Y), Ynp)
+
+
+def test_full_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    X = dpt.arange(10, dtype="i4", sycl_queue=q1, usm_type="shared")
+    Y = dpt.full(10, X[3])
+
+    assert Y.dtype == X.dtype
+    assert Y.usm_type == X.usm_type
+    assert dpt.get_execution_queue((Y.sycl_queue, X.sycl_queue))
+    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="i4"))
+
+    Y = dpt.full(10, X[3], dtype="f4", sycl_queue=q2, usm_type="host")
+
+    assert Y.dtype == dpt.dtype("f4")
+    assert Y.usm_type == "host"
+    assert dpt.get_execution_queue((Y.sycl_queue, q2))
+    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="f4"))
+
+
+@pytest.mark.parametrize("order1", ["F", "C"])
+@pytest.mark.parametrize("order2", ["F", "C"])
+def test_full_order(order1, order2):
+    q = get_queue_or_skip()
+    Xnp = np.array([1, 2, 3], order=order1)
+    Ynp = np.full((3, 3), Xnp, order=order2)
+    Y = dpt.full((3, 3), Xnp, order=order2, sycl_queue=q)
+    assert Y.flags.c_contiguous == Ynp.flags.c_contiguous
+    assert Y.flags.f_contiguous == Ynp.flags.f_contiguous
+    assert np.array_equal(dpt.asnumpy(Y), Ynp)
+
+
+def test_full_strides():
+    q = get_queue_or_skip()
+    X = dpt.full((3, 3), dpt.arange(3, dtype="i4"), sycl_queue=q)
+    Xnp = np.full((3, 3), np.arange(3, dtype="i4"))
+    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
+    assert np.array_equal(dpt.asnumpy(X), Xnp)
+
+    X = dpt.full((3, 3), dpt.arange(6, dtype="i4")[::2], sycl_queue=q)
+    Xnp = np.full((3, 3), np.arange(6, dtype="i4")[::2])
+    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
+    assert np.array_equal(dpt.asnumpy(X), Xnp)
+
+
+@pytest.mark.parametrize("dt", ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"])
+def test_full_gh_1230(dt):
+    get_queue_or_skip()
+    dtype = dpt.dtype(dt)
+    dt_maxint = dpt.iinfo(dtype).max
+
+    if (dtype.itemsize < 8) and (np.lib.NumpyVersion(np.__version__) < "2.0.0"):
+        try:
+            X = dpt.full(1, fill_value=(dt_maxint + 1), dtype=dt)
+        except OverflowError:
+            pytest.skip("Expected OverflowError raised")
+        Y = dpt.full_like(X, fill_value=dpt.iinfo(dt).min)
+        assert dpt.all(X == Y)
+    else:
+        with pytest.raises(OverflowError):
+            dpt.full(1, dt_maxint + 1, dtype=dt)
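+
+
+# Note on the gh-1230 branch above (illustrative): under NumPy < 2.0 an
+# out-of-range Python int fill value wraps around for dtypes narrower than
+# 64 bits (e.g. for "i1", iinfo("i1").max + 1 == 128 wraps to
+# iinfo("i1").min == -128), while NumPy >= 2.0 raises OverflowError instead.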
dtype="f4", device=q).shape == (4,) + + has_fp64 = q.sycl_device.has_aspect_fp64 + if has_fp64: + assert dpt.arange(7, 0, -2, dtype="f8", device=q).shape == (4,) + assert dpt.arange(0, 1, 0.25, dtype="f4", device=q).shape == (4,) + + x = dpt.arange(9.7, stop=10, sycl_queue=q) + assert x.shape == (1,) + assert x.dtype == dpt.float64 if has_fp64 else dpt.float32 + + +def test_arange_step_None(): + q = get_queue_or_skip() + + x = dpt.arange(0, stop=10, step=None, dtype="int32", sycl_queue=q) + assert x.shape == (10,) + + +def test_arange_bool(): + q = get_queue_or_skip() + + x = dpt.arange(0, stop=2, dtype="bool", sycl_queue=q) + assert x.shape == (2,) + assert x.dtype == dpt.bool + + +def test_arange_mixed_types(): + q = get_queue_or_skip() + + x = dpt.arange(-2.5, stop=200, step=100, dtype="int32", sycl_queue=q) + assert x.shape[0] == 3 + assert int(x[1]) == 99 + int(x[0]) + + x = dpt.arange(+2.5, stop=200, step=100, dtype="int32", device=x.device) + assert x.shape[0] == 2 + assert int(x[1]) == 100 + int(x[0]) + + _stop = np.float32(504) + x = dpt.arange(0, stop=_stop, step=100, dtype="f4", device=x.device) + assert x.shape == (6,) + + # ensure length is determined using uncast parameters + x = dpt.arange(-5, stop=10**2, step=2.7, dtype="int64", device=x.device) + assert x.shape == (39,) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +def test_linspace(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + X = dpt.linspace(0, 1, num=2, dtype=dt, sycl_queue=q) + assert np.allclose(dpt.asnumpy(X), np.linspace(0, 1, num=2, dtype=dt)) + + +def test_linspace_fp(): + q = get_queue_or_skip() + n = 16 + X = dpt.linspace(0, n - 1, num=n, sycl_queue=q) + if q.sycl_device.has_aspect_fp64: + assert X.dtype == dpt.dtype("float64") + else: + assert X.dtype == dpt.dtype("float32") + assert X.shape == (n,) + assert X.strides == (1,) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"]) +def test_linspace_fp_max(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + n = 16 + dt = dpt.dtype(dtype) + max_ = dpt.finfo(dt).max + X = dpt.linspace(max_, max_, endpoint=True, num=n, dtype=dt, sycl_queue=q) + assert X.shape == (n,) + assert X.strides == (1,) + assert np.allclose( + dpt.asnumpy(X), np.linspace(max_, max_, endpoint=True, num=n, dtype=dt) + ) + + +def test_linspace_int(): + q = get_queue_or_skip() + X = dpt.linspace(0.1, 9.1, 11, endpoint=True, dtype=int, sycl_queue=q) + Xnp = np.linspace(0.1, 9.1, 11, endpoint=True, dtype=int) + assert np.array_equal(dpt.asnumpy(X), Xnp) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", + "device", + "host", + ], +) +def test_empty_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.empty_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.empty_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + + +def test_empty_unexpected_data_type(): + with pytest.raises(TypeError): + try: + dpt.empty(1, dtype=np.object_) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", 
+ "device", + "host", + ], +) +def test_zeros_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.zeros_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.allclose(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype)) + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.zeros_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.array_equal(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype)) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", + "device", + "host", + ], +) +def test_ones_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.ones_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.ones_like(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + +@pytest.mark.parametrize( + "dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "usm_kind", + [ + "shared", + "device", + "host", + ], +) +def test_full_like(dt, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + fill_v = dpt.dtype(dt).type(1) + X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.full_like(X, fill_v) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q) + Y = dpt.full_like(X, fill_v) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X.sycl_queue == Y.sycl_queue + assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype)) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +@pytest.mark.parametrize("usm_kind", ["shared", "device", "host"]) +def test_eye(dtype, usm_kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.eye(4, 5, k=1, dtype=dtype, usm_type=usm_kind, sycl_queue=q) + Xnp = np.eye(4, 5, k=1, dtype=dtype) + assert X.dtype == Xnp.dtype + assert np.array_equal(Xnp, dpt.asnumpy(X)) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_tril(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + shape = (2, 3, 4, 5, 5) + X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape) + Y = dpt.tril(X) + Xnp = np.arange(prod(shape), dtype=dtype).reshape(shape) + Ynp = np.tril(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("dtype", _all_dtypes[1:]) +def test_triu(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + shape = (4, 5) + X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape) + Y = dpt.triu(X, k=1) + Xnp = np.arange(prod(shape), 
dtype=dtype).reshape(shape) + Ynp = np.triu(Xnp, k=1) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("tri_fn", [dpt.tril, dpt.triu]) +@pytest.mark.parametrize("usm_type", ["device", "shared", "host"]) +def test_tri_usm_type(tri_fn, usm_type): + q = get_queue_or_skip() + dtype = dpt.uint16 + + shape = (2, 3, 4, 5, 5) + size = prod(shape) + X = dpt.reshape( + dpt.arange(size, dtype=dtype, usm_type=usm_type, sycl_queue=q), shape + ) + Y = tri_fn(X) # main execution branch + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == q + Y = tri_fn(X, k=-6) # special case of Y == X + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == q + Y = tri_fn(X, k=6) # special case of Y == 0 + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == q + + +def test_tril_slice(): + q = get_queue_or_skip() + + shape = (6, 10) + X = dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape)[ + 1:, ::-2 + ] + Y = dpt.tril(X) + Xnp = np.arange(prod(shape), dtype="int").reshape(shape)[1:, ::-2] + Ynp = np.tril(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_triu_permute_dims(): + q = get_queue_or_skip() + + shape = (2, 3, 4, 5) + X = dpt.permute_dims( + dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape), + (3, 2, 1, 0), + ) + Y = dpt.triu(X) + Xnp = np.transpose( + np.arange(prod(shape), dtype="int").reshape(shape), (3, 2, 1, 0) + ) + Ynp = np.triu(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_tril_broadcast_to(): + q = get_queue_or_skip() + + shape = (5, 5) + X = dpt.broadcast_to(dpt.ones((1), dtype="int", sycl_queue=q), shape) + Y = dpt.tril(X) + Xnp = np.broadcast_to(np.ones((1), dtype="int"), shape) + Ynp = np.tril(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_triu_bool(): + q = get_queue_or_skip() + + shape = (4, 5) + X = dpt.ones((shape), dtype="bool", sycl_queue=q) + Y = dpt.triu(X) + Xnp = np.ones((shape), dtype="bool") + Ynp = np.triu(Xnp) + assert Y.dtype == Ynp.dtype + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("k", [-10, -2, -1, 3, 4, 10]) +def test_triu_order_k(order, k): + q = get_queue_or_skip() + + shape = (3, 3) + X = dpt.reshape( + dpt.arange(prod(shape), dtype="int", sycl_queue=q), + shape, + order=order, + ) + Y = dpt.triu(X, k=k) + Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order) + Ynp = np.triu(Xnp, k=k) + assert Y.dtype == Ynp.dtype + assert X.flags == Y.flags + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("order", ["F", "C"]) +@pytest.mark.parametrize("k", [-10, -4, -3, 1, 2, 10]) +def test_tril_order_k(order, k): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + shape = (3, 3) + X = dpt.reshape( + dpt.arange(prod(shape), dtype="int", sycl_queue=q), + shape, + order=order, + ) + Y = dpt.tril(X, k=k) + Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order) + Ynp = np.tril(Xnp, k=k) + assert Y.dtype == Ynp.dtype + assert X.flags == Y.flags + assert np.array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_meshgrid(): + q = get_queue_or_skip() + + X = dpt.arange(5, sycl_queue=q) + Y = dpt.arange(3, sycl_queue=q) + Z = dpt.meshgrid(X, Y) + Znp = np.meshgrid(dpt.asnumpy(X), dpt.asnumpy(Y)) + n = len(Z) + assert n == len(Znp) + for i in range(n): + assert 
np.array_equal(dpt.asnumpy(Z[i]), Znp[i]) + assert dpt.meshgrid() == [] + # dimension > 1 must raise ValueError + with pytest.raises(ValueError): + dpt.meshgrid(dpt.usm_ndarray((4, 4))) + # unknown indexing kwarg must raise ValueError + with pytest.raises(ValueError): + dpt.meshgrid(X, indexing="ji") + # input arrays with different data types must raise ValueError + with pytest.raises(ValueError): + dpt.meshgrid(X, dpt.asarray(Y, dtype="b1")) + + +def test_meshgrid2(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + q3 = get_queue_or_skip() + + x1 = dpt.arange(0, 2, dtype="int16", sycl_queue=q1) + x2 = dpt.arange(3, 6, dtype="int16", sycl_queue=q2) + x3 = dpt.arange(6, 10, dtype="int16", sycl_queue=q3) + y1, y2, y3 = dpt.meshgrid(x1, x2, x3, indexing="xy") + z1, z2, z3 = dpt.meshgrid(x1, x2, x3, indexing="ij") + assert all( + x.sycl_queue == y.sycl_queue for x, y in zip((x1, x2, x3), (y1, y2, y3)) + ) + assert all( + x.sycl_queue == z.sycl_queue for x, z in zip((x1, x2, x3), (z1, z2, z3)) + ) + assert y1.shape == y2.shape and y2.shape == y3.shape + assert z1.shape == z2.shape and z2.shape == z3.shape + assert y1.shape == (len(x2), len(x1), len(x3)) + assert z1.shape == (len(x1), len(x2), len(x3)) + + +def test_common_arg_validation(): + order = "I" + # invalid order must raise ValueError + with pytest.raises(ValueError): + dpt.empty(10, order=order) + with pytest.raises(ValueError): + dpt.zeros(10, order=order) + with pytest.raises(ValueError): + dpt.ones(10, order=order) + with pytest.raises(ValueError): + dpt.full(10, 1, order=order) + with pytest.raises(ValueError): + dpt.eye(10, order=order) + try: + X = dpt.empty(10) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(ValueError): + dpt.empty_like(X, order=order) + with pytest.raises(ValueError): + dpt.zeros_like(X, order=order) + with pytest.raises(ValueError): + dpt.ones_like(X, order=order) + with pytest.raises(ValueError): + dpt.full_like(X, 1, order=order) + X = {} + # test for type validation + with pytest.raises(TypeError): + dpt.empty_like(X) + with pytest.raises(TypeError): + dpt.zeros_like(X) + with pytest.raises(TypeError): + dpt.ones_like(X) + with pytest.raises(TypeError): + dpt.full_like(X, 1) + with pytest.raises(TypeError): + dpt.tril(X) + with pytest.raises(TypeError): + dpt.triu(X) + with pytest.raises(TypeError): + dpt.meshgrid(X) + + +def test_flags(): + try: + x = dpt.empty(tuple(), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + f = x.flags + # check comparison with generic types + assert f != Ellipsis + f.__repr__() + assert f.c_contiguous == f["C"] + assert f.f_contiguous == f["F"] + assert f.contiguous == f["CONTIGUOUS"] + assert f.fc == f["FC"] + assert f.forc == f["FORC"] + assert f.fnc == f["FNC"] + assert f.writable == f["W"] + + +def test_asarray_uint64(): + Xnp = np.ndarray(1, dtype=np.uint64) + try: + X = dpt.asarray(Xnp) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + assert X.dtype == Xnp.dtype + + +def test_Device(): + try: + dev = dpctl.select_default_device() + d1 = dpt.Device.create_device(dev) + d2 = dpt.Device.create_device(dev) + except (dpctl.SyclQueueCreationError, dpctl.SyclDeviceCreationError): + pytest.skip( + "Could not create default device, or a queue that targets it" + ) + assert d1 == d2 + dict = {d1: 1} + assert dict[d2] == 1 + assert d1 == d2.sycl_queue + assert not d1 == Ellipsis + + +def test_element_offset(): + n0, n1 = 3, 8 + try: + x 
= dpt.empty((n0, n1), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert isinstance(x._element_offset, int)
+    assert x._element_offset == 0
+    y = x[::-1, ::2]
+    assert y._element_offset == (n0 - 1) * n1
+
+
+def test_byte_bounds():
+    n0, n1 = 3, 8
+    try:
+        x = dpt.empty((n0, n1), dtype="i4")
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert isinstance(x._byte_bounds, tuple)
+    assert len(x._byte_bounds) == 2
+    lo, hi = x._byte_bounds
+    assert hi - lo == n0 * n1 * x.itemsize
+    y = x[::-1, ::2]
+    lo, hi = y._byte_bounds
+    assert hi - lo == (n0 * n1 - 1) * x.itemsize
+
+
+def test_gh_1201():
+    n = 100
+    a = np.flipud(np.arange(n, dtype="i4"))
+    try:
+        b = dpt.asarray(a)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    assert (dpt.asnumpy(b) == a).all()
+    c = dpt.flip(dpt.empty(a.shape, dtype=a.dtype))
+    c[:] = a
+    assert (dpt.asnumpy(c) == a).all()
+
+
+class ObjWithSyclUsmArrayInterface:
+    def __init__(self, ary):
+        self._array_obj = ary
+
+    @property
+    def __sycl_usm_array_interface__(self):
+        _suai = self._array_obj.__sycl_usm_array_interface__
+        return _suai
+
+
+@pytest.mark.parametrize("ro_flag", [True, False])
+def test_asarray_writable_flag(ro_flag):
+    try:
+        a = dpt.empty(8)
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+
+    a.flags["W"] = not ro_flag
+    wrapped = ObjWithSyclUsmArrayInterface(a)
+
+    b = dpt.asarray(wrapped)
+
+    assert b.flags["W"] == (not ro_flag)
+    assert b._pointer == a._pointer
+
+
+def test_getitem_validation():
+    """Test based on gh-1785"""
+    try:
+        a = dpt.empty((2, 2, 2))
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    with pytest.raises(IndexError):
+        a[0.0]
+    with pytest.raises(IndexError):
+        a[1, 0.0, ...]
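+    # a float index must be rejected in any position, including when
+    # combined with dpt.newaxis and Ellipsis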
+    with pytest.raises(IndexError):
+        a[1, 0.0, dpt.newaxis, 1]
+    with pytest.raises(IndexError):
+        a[dpt.newaxis, ..., 0.0]
+    with pytest.raises(IndexError):
+        a[dpt.newaxis, ..., 0.0, dpt.newaxis]
+    with pytest.raises(IndexError):
+        a[..., 0.0, dpt.newaxis]
+    with pytest.raises(IndexError):
+        a[:, 0.0, dpt.newaxis]
+
+
+def test_array_like_ctors_order_K():
+    get_queue_or_skip()
+
+    sh = (10, 10)
+    x1 = dpt.zeros(sh, dtype="i4", order="C")
+    r1 = dpt.full_like(x1, 2, order="K")
+    assert dpt.all(r1 == 2)
+    assert r1.flags.c_contiguous
+    r2 = dpt.empty_like(x1, order="K")
+    assert r2.flags.c_contiguous
+    r3 = dpt.ones_like(x1, order="K")
+    assert dpt.all(r3 == 1)
+    assert r3.flags.c_contiguous
+    r4 = dpt.zeros_like(x1, order="K")
+    assert dpt.all(r4 == 0)
+    assert r4.flags.c_contiguous
+
+    x2 = dpt.zeros(sh, dtype="i4", order="F")
+    r5 = dpt.full_like(x2, 2, order="K")
+    assert dpt.all(r5 == 2)
+    assert r5.flags.f_contiguous
+    r6 = dpt.empty_like(x2, order="K")
+    assert r6.flags.f_contiguous
+    r7 = dpt.ones_like(x2, order="K")
+    assert dpt.all(r7 == 1)
+    assert r7.flags.f_contiguous
+    r8 = dpt.zeros_like(x2, order="K")
+    assert dpt.all(r8 == 0)
+    assert r8.flags.f_contiguous
+
+    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
+    st_expected = (-5, 1)
+    r9 = dpt.full_like(x3, 2, order="K")
+    assert dpt.all(r9 == 2)
+    assert r9.strides == st_expected
+    assert not r9.flags.forc
+    r10 = dpt.empty_like(x3, order="K")
+    assert not r10.flags.forc
+    assert r10.strides == st_expected
+    r11 = dpt.ones_like(x3, order="K")
+    assert dpt.all(r11 == 1)
+    assert not r11.flags.forc
+    assert r11.strides == st_expected
+    r12 = dpt.zeros_like(x3, order="K")
+    assert dpt.all(r12 == 0)
+    assert not r12.flags.forc
+    assert r12.strides == st_expected
+
+
+def test_array_like_ctors_order_A():
+    get_queue_or_skip()
+
+    sh = (10, 10)
+    x1 = dpt.zeros(sh, dtype="i4", order="C")
+    r1 = dpt.full_like(x1, 2, order="A")
+    assert dpt.all(r1 == 2)
+    assert r1.flags.c_contiguous
+    r2 = dpt.empty_like(x1, order="A")
+    assert r2.flags.c_contiguous
+    r3 = dpt.ones_like(x1, order="A")
+    assert dpt.all(r3 == 1)
+    assert r3.flags.c_contiguous
+    r4 = dpt.zeros_like(x1, order="A")
+    assert dpt.all(r4 == 0)
+    assert r4.flags.c_contiguous
+
+    x2 = dpt.zeros(sh, dtype="i4", order="F")
+    r5 = dpt.full_like(x2, 2, order="A")
+    assert dpt.all(r5 == 2)
+    assert r5.flags.f_contiguous
+    r6 = dpt.empty_like(x2, order="A")
+    assert r6.flags.f_contiguous
+    r7 = dpt.ones_like(x2, order="A")
+    assert dpt.all(r7 == 1)
+    assert r7.flags.f_contiguous
+    r8 = dpt.zeros_like(x2, order="A")
+    assert dpt.all(r8 == 0)
+    assert r8.flags.f_contiguous
+
+    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
+    r9 = dpt.full_like(x3, 2, order="A")
+    assert dpt.all(r9 == 2)
+    assert r9.flags.c_contiguous
+    r10 = dpt.empty_like(x3, order="A")
+    assert r10.flags.c_contiguous
+    r11 = dpt.ones_like(x3, order="A")
+    assert dpt.all(r11 == 1)
+    assert r11.flags.c_contiguous
+    r12 = dpt.zeros_like(x3, order="A")
+    assert dpt.all(r12 == 0)
+    assert r12.flags.c_contiguous
+
+
+def test_full_like_order_K_array_fill_v():
+    get_queue_or_skip()
+
+    x = dpt.zeros((10, 10), dtype="i4")
+    fill_v = dpt.asarray(2, dtype="i4")
+
+    r1 = dpt.full_like(x, fill_v, order="K")
+    assert dpt.all(r1 == 2)
+
+    # broadcast behavior
+    fill_v = dpt.arange(10, dtype="i4")[:, dpt.newaxis]
+    r1 = dpt.full_like(x, fill_v, order="K")
+    assert dpt.all(r1 == dpt.tile(fill_v, (1, 10)))
+
+
+def test_full_like_order_K_same_input_output_queues():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
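+    # the result of full_like is expected to be associated with the
+    # queue of the array argument x, not the queue of fill_v below
+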
x = dpt.zeros((10, 10), dtype="i4", sycl_queue=q1) + fill_v = dpt.asarray(2, dtype="i4", sycl_queue=q2) + + r = dpt.full_like(x, fill_v, order="K") + assert r.sycl_queue == x.sycl_queue + + +def test_asarray_from_numpy_contig(): + get_queue_or_skip() + + i_dt = np.int64 + Xnp = np.arange(32, dtype=i_dt) + + fp_dt = dpt.float32 + # Use contig copy kernel + Xdpt = dpt.asarray(Xnp, dtype=fp_dt) + + assert dpt.all(Xdpt == dpt.arange(32, dtype=fp_dt)) + + +def test_setitem_from_numpy_contig(): + get_queue_or_skip() + + i_dt = np.int64 + fp_dt = dpt.float32 + + Xnp = np.flip(np.arange(32, dtype=i_dt)) + Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt)) + # Use contig copy kernel, after stride simplification + Xdpt[:] = Xnp + + expected = dpt.arange(31, stop=-1, step=-1, dtype=fp_dt) + assert dpt.all(Xdpt == expected) + + Xnp = np.fliplr(np.reshape(np.arange(-10, 10, dtype=i_dt), (4, 5))) + Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt), axis=-1) + + # after stride simplification, contig kernel is used + Xdpt[:] = Xnp + + expected = dpt.reshape(dpt.arange(-10, 10, dtype=fp_dt), (4, 5)) + assert dpt.all(dpt.flip(Xdpt, axis=-1) == expected) + + +def test_full_functions_raise_type_error(): + get_queue_or_skip() + + with pytest.raises(TypeError): + dpt.full(1, "0") + + x = dpt.ones(1, dtype="i4") + with pytest.raises(TypeError): + dpt.full_like(x, "0") + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_setitem_copy_as_contig_alignment(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 8, 23 + + x = dpt.zeros((n0, n1), dtype=dtype_, sycl_queue=q) + + vals = dpt.ones(n1, dtype=dtype_, sycl_queue=q)[dpt.newaxis, :] + x[1:, ...] = vals + assert dpt.all(x[0] == 0) + assert dpt.all(x[1:, :] == vals) + + +def test_asarray_property(): + get_queue_or_skip() + + x = dpt.ones(11, dtype="i4") + + with pytest.raises(TypeError): + np.asarray(x) diff --git a/dpnp/tests/tensor/test_usm_ndarray_dlpack.py b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py new file mode 100644 index 000000000000..7db73467f788 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_dlpack.py @@ -0,0 +1,919 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import collections
+import ctypes
+
+import dpctl
+import numpy as np
+import pytest
+
+import dpnp.tensor as dpt
+import dpnp.tensor._dlpack as _dlp
+import dpnp.tensor._usmarray as dpt_arr
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+device_CPU = dpt_arr.DLDeviceType.kDLCPU
+device_oneAPI = dpt_arr.DLDeviceType.kDLOneAPI
+
+_usm_types_list = ["shared", "device", "host"]
+
+
+@pytest.fixture(params=_usm_types_list)
+def usm_type(request):
+    return request.param
+
+
+_typestrs_list = [
+    "b1",
+    "u1",
+    "i1",
+    "u2",
+    "i2",
+    "u4",
+    "i4",
+    "u8",
+    "i8",
+    "f2",
+    "f4",
+    "f8",
+    "c8",
+    "c16",
+]
+
+
+@pytest.fixture(params=_typestrs_list)
+def typestr(request):
+    return request.param
+
+
+@pytest.fixture
+def all_root_devices():
+    """
+    Collects root devices. For the sake of speed
+    of test suite execution, keep at most two
+    devices from each platform
+    """
+    devs = dpctl.get_devices()
+    devs_per_platform = collections.defaultdict(list)
+    for dev in devs:
+        devs_per_platform[dev.sycl_platform].append(dev)
+
+    pruned = map(lambda li: li[:2], devs_per_platform.values())
+    return sum(pruned, start=[])
+
+
+def test_dlpack_device(usm_type, all_root_devices):
+    for sycl_dev in all_root_devices:
+        X = dpt.empty((64,), dtype="u1", usm_type=usm_type, device=sycl_dev)
+        dev = X.__dlpack_device__()
+        assert type(dev) is tuple
+        assert len(dev) == 2
+        assert dev[0] == device_oneAPI
+        assert dev[1] == sycl_dev.get_device_id()
+
+
+def test_dlpack_exporter(typestr, usm_type, all_root_devices):
+    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
+    caps_fn.restype = bool
+    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    for sycl_dev in all_root_devices:
+        skip_if_dtype_not_supported(typestr, sycl_dev)
+        X = dpt.empty((64,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
+        caps = X.__dlpack__()
+        assert caps_fn(caps, b"dltensor")
+        Y = X[::2]
+        caps2 = Y.__dlpack__()
+        assert caps_fn(caps2, b"dltensor")
+
+
+def test_dlpack_exporter_empty(typestr, usm_type):
+    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
+    caps_fn.restype = bool
+    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    try:
+        sycl_dev = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("No SYCL devices available")
+    skip_if_dtype_not_supported(typestr, sycl_dev)
+    X = dpt.empty((0,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
+    caps = X.__dlpack__()
+    assert caps_fn(caps, b"dltensor")
+    Y = dpt.empty(
+        (
+            1,
+            0,
+        ),
+        dtype=typestr,
+        usm_type=usm_type,
+        device=sycl_dev,
+    )
+    caps = Y.__dlpack__()
+    assert caps_fn(caps, b"dltensor")
+
+
+def test_dlpack_exporter_stream():
+    try:
+        q1 = dpctl.SyclQueue()
+        q2 = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Could not create default queues")
+    X = dpt.empty((64,), dtype="u1", sycl_queue=q1)
+    cap1 = X.__dlpack__(stream=q1)
+    cap2 = X.__dlpack__(stream=q2)
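+    # exporting must produce a capsule of the same type no matter which
+    # queue is passed as `stream` for synchronization
+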
assert type(cap1) is type(cap2) + + +@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)]) +def test_from_dlpack(shape, typestr, usm_type, all_root_devices): + for sycl_dev in all_root_devices: + skip_if_dtype_not_supported(typestr, sycl_dev) + X = dpt.empty(shape, dtype=typestr, usm_type=usm_type, device=sycl_dev) + Y = dpt.from_dlpack(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X._pointer == Y._pointer + # we can only expect device to round-trip for USM-device and + # USM-shared allocations, which are made for specific device + assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device) + if Y.ndim: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +@pytest.mark.parametrize("mod", [2, 5]) +def test_from_dlpack_strides(mod, typestr, usm_type, all_root_devices): + for sycl_dev in all_root_devices: + skip_if_dtype_not_supported(typestr, sycl_dev) + X0 = dpt.empty( + 3 * mod, dtype=typestr, usm_type=usm_type, device=sycl_dev + ) + for start in range(mod): + X = X0[slice(-start - 1, None, -mod)] + Y = dpt.from_dlpack(X) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + assert X.usm_type == Y.usm_type + assert X._pointer == Y._pointer + # we can only expect device to round-trip for USM-device and + # USM-shared allocations, which are made for specific device + assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device) + if Y.ndim: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +def test_from_dlpack_input_validation(): + v = dpt._dlpack.get_build_dlpack_version() + assert type(v) is tuple + with pytest.raises(TypeError): + dpt.from_dlpack(None) + + class DummyWithProperty: + @property + def __dlpack__(self): + return None + + with pytest.raises(TypeError): + dpt.from_dlpack(DummyWithProperty()) + + class DummyWithMethod: + def __dlpack__(self): + return None + + with pytest.raises(TypeError): + dpt.from_dlpack(DummyWithMethod()) + + +def test_from_dlpack_fortran_contig_array_roundtripping(): + """Based on examples from issue gh-1241""" + n0, n1 = 3, 5 + try: + ar1d = dpt.arange(n0 * n1, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + ar2d_c = dpt.reshape(ar1d, (n0, n1), order="C") + ar2d_f = dpt.asarray(ar2d_c, order="F") + ar2d_r = dpt.from_dlpack(ar2d_f) + + assert dpt.all(dpt.equal(ar2d_f, ar2d_r)) + assert dpt.all(dpt.equal(ar2d_c, ar2d_r)) + + +def test_dlpack_from_subdevice(): + """ + This test checks that array allocated on a sub-device, + with memory bound to platform-default SyclContext can be + exported and imported via DLPack. 
+ """ + n = 64 + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + try: + sdevs = dev.create_sub_devices(partition="next_partitionable") + except dpctl.SyclSubDeviceCreationError: + sdevs = None + try: + if sdevs is None: + sdevs = dev.create_sub_devices(partition=[1, 1]) + except dpctl.SyclSubDeviceCreationError: + pytest.skip("Default device can not be partitioned") + assert isinstance(sdevs, list) and len(sdevs) > 0 + try: + ctx = sdevs[0].sycl_platform.default_context + except dpctl.SyclContextCreationError: + pytest.skip("Platform's default_context is not available") + try: + q = dpctl.SyclQueue(ctx, sdevs[0]) + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + + ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q) + ar2 = dpt.from_dlpack(ar) + assert ar2.sycl_device == sdevs[0] + + +def test_legacy_dlpack_capsule(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + + cap = x.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x._pointer == y._pointer + + x = dpt.arange(100, dtype="u4") + x2 = dpt.reshape(x, (10, 10)).mT + cap = x2.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x2._pointer == y._pointer + del x2 + + x = dpt.arange(100, dtype="f4") + x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F") + cap = x2.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x2._pointer == y._pointer + + x = dpt.arange(100, dtype="c8") + x3 = x[::-2] + cap = x3.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x3._pointer == y._pointer + del x3, y, x + del cap + + x = dpt.ones(100, dtype="?") + x4 = x[::-2] + cap = x4.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x4._pointer == y._pointer + del x4, y, x + del cap + + +def test_versioned_dlpack_capsule(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x._pointer == y._pointer + + x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F") + cap = x2.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x2._pointer == y._pointer + del x2 + + x3 = x[::-2] + cap = x3.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x3._pointer == y._pointer + del x3, y, x + del cap + + # read-only array + x = dpt.arange(100, dtype="i4") + x.flags["W"] = False + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert x._pointer == y._pointer + assert not y.flags.writable + + # read-only array, and copy + cap = x.__dlpack__(max_version=max_supported_ver, copy=True) + y = _dlp.from_dlpack_capsule(cap) + assert x._pointer != y._pointer + assert not y.flags.writable + + +def test_from_dlpack_kwargs(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + y = dpt.from_dlpack(x, copy=True) + assert x._pointer != y._pointer + + z = dpt.from_dlpack(x, device=x.sycl_device) + assert z._pointer == x._pointer + + +def test_dlpack_deleters(): + try: + x = dpt.arange(100, 
dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + cap = x.__dlpack__(max_version=legacy_ver) + del cap + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + del cap + + +def test_from_dlpack_device(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + out = dpt.from_dlpack(x, device=x.__dlpack_device__()) + assert x.device == out.device + assert x._pointer == out._pointer + + out = dpt.from_dlpack(x, device=x.device) + assert x.device == out.device + assert x._pointer == out._pointer + + out = dpt.from_dlpack(x, device=x.sycl_device) + assert x.device == out.device + assert x._pointer == out._pointer + + +def test_used_dlpack_capsule(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + cap = x.__dlpack__(max_version=legacy_ver) + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + +def test_dlpack_size_0(): + try: + x = dpt.ones(0, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + legacy_ver = (0, 8) + cap = x.__dlpack__(max_version=legacy_ver) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x._pointer + + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x._pointer + + +def test_dlpack_max_version_validation(): + try: + x = dpt.ones(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + with pytest.raises( + TypeError, + match=r"`__dlpack__` expects `max_version` to be a " + r"2-tuple of integers `\(major, minor\)`, instead " + r"got .*", + ): + x.__dlpack__(max_version=1) + + +def test_dlpack_kwargs(): + try: + q1 = dpctl.SyclQueue() + q2 = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Could not create default queues") + x = dpt.arange(100, dtype="i4", sycl_queue=q1) + + legacy_ver = (0, 8) + cap = x.__dlpack__(stream=q2, max_version=legacy_ver, copy=True) + # `copy` ignored for legacy path + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x._pointer + del x, y + del cap + + x1 = dpt.arange(100, dtype="i4", sycl_queue=q1) + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x1.__dlpack__(stream=q2, max_version=max_supported_ver, copy=False) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer == x1._pointer + del x1, y + del cap + + x2 = dpt.arange(100, dtype="i4", sycl_queue=q1) + cap = x2.__dlpack__(stream=q2, max_version=max_supported_ver, copy=True) + y = _dlp.from_dlpack_capsule(cap) + assert y._pointer != x2._pointer + del x2, y + del cap + + +def _is_capsule(o): + t = type(o) + return t.__module__ == "builtins" and t.__name__ == "PyCapsule" + + +def test_dlpack_dl_device(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices 
available") + max_supported_ver = _dlp.get_build_dlpack_version() + cap1 = x.__dlpack__( + dl_device=x.__dlpack_device__(), max_version=max_supported_ver + ) + assert _is_capsule(cap1) + cap2 = x.__dlpack__(dl_device=(1, 0), max_version=max_supported_ver) + assert _is_capsule(cap2) + cap3 = x.__dlpack__( + dl_device=(device_CPU, 0), + max_version=max_supported_ver, + ) + assert _is_capsule(cap3) + cap4 = x.__dlpack__(dl_device=("kDLCPU", 0), max_version=max_supported_ver) + assert _is_capsule(cap4) + with pytest.raises(TypeError): + # pass method instead of return of its __call__ invocation + x.__dlpack__( + dl_device=x.__dlpack_device__, max_version=max_supported_ver + ) + with pytest.raises(TypeError): + # exercise check for length + x.__dlpack__(dl_device=(3,), max_version=max_supported_ver) + + +def test_from_dlpack_kdlcpu_interop_numpy(): + """ + Basic test that usm_ndarray can interoperate with NumPy ndarray + `__dlpack_device__`. + """ + get_queue_or_skip() + + sh = 5 + dt = dpt.int32 + + X = dpt.empty(sh, dtype=dt) + dl_device_np = np.empty(()).__dlpack_device__() + + Y = dpt.from_dlpack(X, device=dl_device_np) + assert isinstance(Y, np.ndarray) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + + V = dpt.from_dlpack(Y) + assert isinstance(V, np.ndarray) + assert Y.shape == V.shape + assert Y.dtype == V.dtype + + +@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)]) +def test_from_dlpack_to_kdlcpu(shape, typestr): + q = get_queue_or_skip() + skip_if_dtype_not_supported(typestr, q.sycl_device) + + X = dpt.empty(shape, dtype=typestr, sycl_queue=q) + Y = dpt.from_dlpack(X, device=(device_CPU, 0)) + assert isinstance(Y, np.ndarray) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + # NumPy does not treat size 0 arrays consistently + # w.r.t. strides, so skip these cases + if X.ndim and X.size != 0: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +@pytest.mark.parametrize("mod", [2, 5]) +def test_from_dlpack_to_kdlcpu_strides(mod, typestr): + q = get_queue_or_skip() + skip_if_dtype_not_supported(typestr, q.sycl_device) + + X0 = dpt.empty(3 * mod, dtype=typestr, sycl_queue=q) + for start in range(mod): + X = X0[slice(-start - 1, None, -mod)] + Y = dpt.from_dlpack(X, device=(device_CPU, 0)) + assert X.shape == Y.shape + assert X.dtype == Y.dtype + if Y.ndim: + V = Y[::-1] + W = dpt.from_dlpack(V) + assert V.strides == W.strides + + +def test_dlpack_from_subdevice_to_kdlcpu(): + """ + Check that array allocated on a sub-device can be + imported via DLPack to kDLCPU device (as a NumPy array). 
+ """ + n = 64 + try: + dev = dpctl.SyclDevice() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + try: + sdevs = dev.create_sub_devices(partition="next_partitionable") + except dpctl.SyclSubDeviceCreationError: + sdevs = None + try: + if sdevs is None: + sdevs = dev.create_sub_devices(partition=[1, 1]) + except dpctl.SyclSubDeviceCreationError: + pytest.skip("Default device can not be partitioned") + assert isinstance(sdevs, list) and len(sdevs) > 0 + try: + ctx = sdevs[0].sycl_platform.default_context + except dpctl.SyclContextCreationError: + pytest.skip("Platform's default_context is not available") + try: + q = dpctl.SyclQueue(ctx, sdevs[0]) + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + + ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q) + ar2 = dpt.from_dlpack(ar, dl_device=(device_CPU, 0)) + assert isinstance(ar2, np.ndarray) + + +def test_legacy_dlpack_capsule_from_numpy(): + """ + Check that NumPy's exported legacy DLPack capsule + will interoperate with from_dlpack_capsule, + especially with zero-copy. + """ + x = np.arange(100, dtype="i4") + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + + x = np.arange(100, dtype="u4").reshape((10, 10)).T + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + del x + + x = np.arange(100, dtype="f4").reshape((10, 10), order="F") + cap = x.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + del cap + assert x.ctypes.data == y.ctypes.data + + x = np.arange(100, dtype="c8") + x1 = x[::-2] + cap = x1.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert x1.ctypes.data == y.ctypes.data + del x1, y, x + del cap + + x = np.ones(100, dtype="?") + x1 = x[::-2] + cap = x1.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert x1.ctypes.data == y.ctypes.data + del x1, y, x + del cap + + +def test_dlpack_capsule_readonly_array_to_kdlcpu(): + try: + x = dpt.arange(100, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + max_supported_ver = _dlp.get_build_dlpack_version() + # read-only array + x.flags["W"] = False + cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0)) + y = _dlp.from_dlpack_capsule(cap) + assert dpt.all(x == dpt.asarray(y)) + assert not y.flags["W"] + + cap1 = _dlp.numpy_to_dlpack_versioned_capsule(y, not y.flags["W"]) + y1 = _dlp.from_dlpack_capsule(cap1) + assert not y1.flags["W"] + + +def test_to_dlpack_capsule_c_and_f_contig(): + try: + x = dpt.asarray(np.random.rand(2, 3)) + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + cap = _dlp.to_dlpack_capsule(x) + y = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y)) + assert x.strides == y.strides + + x_f = x.T + cap = _dlp.to_dlpack_capsule(x_f) + yf = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf)) + assert x_f.strides == yf.strides + del cap + + +def test_to_dlpack_versioned_capsule_c_and_f_contig(): + try: + x = dpt.asarray(np.random.rand(2, 3)) + max_supported_ver = _dlp.get_build_dlpack_version() + except dpctl.SyclDeviceCreationError: + pytest.skip("No default device available") + + cap = x.__dlpack__(max_version=max_supported_ver) + y = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y)) + assert x.strides == y.strides + + x_f = x.T + cap = 
x_f.__dlpack__(max_version=max_supported_ver) + yf = _dlp.from_dlpack_capsule(cap) + assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf)) + assert x_f.strides == yf.strides + del cap + + +def test_used_dlpack_capsule_from_numpy(): + get_queue_or_skip() + + x_np = np.arange(100, dtype="i4") + + cap = x_np.__dlpack__() + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + x = dpt.asarray(x_np) + max_supported_ver = _dlp.get_build_dlpack_version() + cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0)) + _dlp.from_dlpack_capsule(cap) + with pytest.raises( + ValueError, + match="A DLPack tensor object can not be consumed multiple times", + ): + _dlp.from_dlpack_capsule(cap) + del cap + + +def test_dlpack_size_0_on_kdlcpu(): + get_queue_or_skip() + x_np = np.ones(0, dtype="i4") + + cap = x_np.__dlpack__() + y = _dlp.from_dlpack_capsule(cap) + assert y.ctypes.data == x_np.ctypes.data + + +def test_copy_via_host(): + get_queue_or_skip() + x = dpt.ones(1, dtype="i4") + x_np = np.ones(1, dtype="i4") + x_dl_dev = x.__dlpack_device__() + y = dpt.from_dlpack(x_np, device=x_dl_dev) + assert isinstance(y, dpt.usm_ndarray) + assert y.sycl_device == x.sycl_device + assert y.usm_type == "device" + + with pytest.raises(ValueError): + # incorrect length of tuple + dpt.from_dlpack(x_np, device=(1, 0, 0)) + with pytest.raises(ValueError): + # only kDLCPU and kDLOneAPI are supported + dpt.from_dlpack(x, device=(2, 0)) + + num_devs = dpctl.get_num_devices() + if num_devs > 1: + j = [i for i in range(num_devs) if i != x_dl_dev[1]][0] + z = dpt.from_dlpack(x, device=(x_dl_dev[0], j)) + assert isinstance(z, dpt.usm_ndarray) + assert z.usm_type == "device" + + +def test_copy_via_host_gh_1789(): + "Test based on review example from gh-1789" + get_queue_or_skip() + x_np = np.ones((10, 10), dtype="i4") + # strides are no longer multiple of itemsize + x_np = np.lib.stride_tricks.as_strided( + x_np, shape=x_np.shape, strides=(x_np.strides[0] - 1, x_np.strides[1]) + ) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np) + with pytest.raises(BufferError): + dpt.from_dlpack(x_np, device=(14, 0)) + + +class LegacyContainer: + "Helper class implementing legacy `__dlpack__` protocol" + + def __init__(self, array): + self._array = array + + def __dlpack__(self, stream=None): + return self._array.__dlpack__(stream=stream) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +class Container: + "Helper class implementing `__dlpack__` protocol version 1.0" + + def __init__(self, array): + self._array = array + + def __dlpack__( + self, max_version=None, dl_device=None, copy=None, stream=None + ): + return self._array.__dlpack__( + max_version=max_version, + dl_device=dl_device, + copy=copy, + stream=stream, + ) + + def __dlpack_device__(self): + return self._array.__dlpack_device__() + + +def test_generic_container_legacy(): + get_queue_or_skip() + C = LegacyContainer(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, dpt.usm_ndarray) + assert X._pointer == C._array._pointer + assert X.sycl_device == C._array.sycl_device + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + Z = dpt.from_dlpack(C, device=X.device) + assert isinstance(Z, dpt.usm_ndarray) + assert Z._pointer == X._pointer + 
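+    # same pointer: re-importing onto the producing device is zero-copy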
assert Z.device == X.device + + +def test_generic_container_legacy_np(): + get_queue_or_skip() + C = LegacyContainer(np.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, np.ndarray) + assert X.ctypes.data == C._array.ctypes.data + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + dev = dpt.Device.create_device() + Z = dpt.from_dlpack(C, device=dev) + assert isinstance(Z, dpt.usm_ndarray) + assert Z.device == dev + + +def test_generic_container(): + get_queue_or_skip() + C = Container(dpt.linspace(0, 100, num=20, dtype="int16")) + + X = dpt.from_dlpack(C) + assert isinstance(X, dpt.usm_ndarray) + assert X._pointer == C._array._pointer + assert X.sycl_device == C._array.sycl_device + assert X.dtype == C._array.dtype + + Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0)) + assert isinstance(Y, np.ndarray) + assert Y.dtype == X.dtype + + Z = dpt.from_dlpack(C, device=X.device) + assert isinstance(Z, dpt.usm_ndarray) + assert Z._pointer == X._pointer + assert Z.device == X.device + + +def test_sycl_device_to_dldevice(all_root_devices): + for sycl_dev in all_root_devices: + dev = dpt.sycl_device_to_dldevice(sycl_dev) + assert type(dev) is tuple + assert len(dev) == 2 + assert dev[0] == device_oneAPI + assert dev[1] == sycl_dev.get_device_id() + + +def test_dldevice_to_sycl_device(all_root_devices): + for sycl_dev in all_root_devices: + dldev = dpt.empty(0, device=sycl_dev).__dlpack_device__() + dev = dpt.dldevice_to_sycl_device(dldev) + assert type(dev) is dpctl.SyclDevice + assert dev.get_device_id() == sycl_dev.get_device_id() + + +def test_dldevice_conversion_arg_validation(): + bad_dldevice_type = (dpt.DLDeviceType.kDLCPU, 0) + with pytest.raises(ValueError): + dpt.dldevice_to_sycl_device(bad_dldevice_type) + + bad_dldevice_len = bad_dldevice_type + (0,) + with pytest.raises(ValueError): + dpt.dldevice_to_sycl_device(bad_dldevice_len) + + bad_dldevice = {} + with pytest.raises(TypeError): + dpt.dldevice_to_sycl_device(bad_dldevice) + + bad_sycldevice = {} + with pytest.raises(TypeError): + dpt.sycl_device_to_dldevice(bad_sycldevice) diff --git a/dpnp/tests/tensor/test_usm_ndarray_indexing.py b/dpnp/tests/tensor/test_usm_ndarray_indexing.py new file mode 100644 index 000000000000..b81e5456872b --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_indexing.py @@ -0,0 +1,2054 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt +import dpnp.tensor._tensor_impl as ti +from dpnp.tensor._copy_utils import _take_multi_index + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_all_int_dtypes = ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"] + + +def test_basic_slice1(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="u2", sycl_queue=q) + y = x[0] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == () + assert y.strides == () + + +def test_basic_slice2(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[(0,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == 0 + assert y.shape == () + assert y.strides == () + + +def test_basic_slice3(): + q = get_queue_or_skip() + x = dpt.empty(10, dtype="i2", sycl_queue=q) + y = x[:] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides + y = x[(slice(None, None, None),)] + assert isinstance(y, dpt.usm_ndarray) + assert y.ndim == x.ndim + assert y.shape == x.shape + assert y.strides == x.strides + + +def test_basic_slice4(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="f4", sycl_queue=q) + y = x[::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (-x.strides[0], x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n0 - 1) * n1 + + +def test_basic_slice5(): + q = get_queue_or_skip() + n0, n1 = 5, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[:, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == x.shape + assert y.strides == (x.strides[0], -x.strides[1]) + actual_offset = y.__sycl_usm_array_interface__["offset"] + assert actual_offset == (n1 - 1) + + +def test_basic_slice6(): + q = get_queue_or_skip() + i0, n0, n1 = 2, 4, 3 + x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q) + y = x[i0, ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (x.shape[1],) + assert y.strides == (-x.strides[1],) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = i0 * x.strides[0] + (n1 - 1) * x.strides[1] + assert actual_offset == expected_offset + + +def test_basic_slice7(): + q = get_queue_or_skip() + n0, n1, n2 = 5, 3, 2 + x = dpt.empty((n0, n1, n2), dtype="?", sycl_queue=q) + y = x[..., ::-1] + assert isinstance(y, dpt.usm_ndarray) + assert 
y.shape == x.shape + assert y.strides == ( + x.strides[0], + x.strides[1], + -x.strides[2], + ) + actual_offset = y.__sycl_usm_array_interface__["offset"] + expected_offset = (n2 - 1) * x.strides[2] + assert actual_offset == expected_offset + + +def test_basic_slice8(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u1", sycl_queue=q) + y = x[..., dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (n0, n1, 1) + assert y.strides == (n1, 1, 0) + + +def test_basic_slice9(): + q = get_queue_or_skip() + n0, n1 = 3, 7 + x = dpt.empty((n0, n1), dtype="u8", sycl_queue=q) + y = x[dpt.newaxis, ...] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1) + assert y.strides == (0, n1, 1) + + +def test_basic_slice10(): + q = get_queue_or_skip() + n0, n1, n2 = 3, 7, 5 + x = dpt.empty((n0, n1, n2), dtype="u1", sycl_queue=q) + y = x[dpt.newaxis, ..., :] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1, n0, n1, n2) + assert y.strides == (0, n1 * n2, n2, 1) + + +def _all_equal(it1, it2): + return all(bool(x == y) for x, y in zip(it1, it2)) + + +def test_advanced_slice1(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + y = x[(ii,)] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice1_negative_strides(): + q = get_queue_or_skip() + ii = dpt.asarray([0, 1], sycl_queue=q) + x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q)) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert y.strides == (1,) + assert _all_equal( + (x[ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice2(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii, dpt.newaxis] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + (1,) + assert y.flags["C"] + + +def test_advanced_slice3(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[dpt.newaxis, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (1,) + ii.shape + assert y.flags["C"] + + +def _make_3d(dt, q): + return dpt.reshape( + dpt.arange(3 * 3 * 3, dtype=dt, sycl_queue=q), + ( + 3, + 3, + 3, + ), + ) + + +def test_advanced_slice4(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[ii, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert _all_equal( + (x[ii[k], ii[k], ii[k]] for k in range(ii.shape[0])), + (y[k] for k in range(ii.shape[0])), + ) + + +def test_advanced_slice5(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = _make_3d("i4", q) + y = x[ii, 0, ii] + assert isinstance(y, dpt.usm_ndarray) + # 0 broadcast to [0, 0] per array API + assert y.shape == ii.shape + assert _all_equal( + (x[ii[i], 0, ii[i]] for i in range(ii.shape[0])), + (y[i] for i in range(ii.shape[0])), + ) + + +def test_advanced_slice6(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = 
_make_3d("i4", q) + y = x[:, ii, ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ( + x.shape[0], + ii.shape[0], + ) + assert _all_equal( + ( + x[i, ii[k], ii[k]] + for i in range(x.shape[0]) + for k in range(ii.shape[0]) + ), + (y[i, k] for i in range(x.shape[0]) for k in range(ii.shape[0])), + ) + + +def test_advanced_slice7(): + q = get_queue_or_skip() + mask = dpt.asarray( + [ + [[True, True, False], [False, True, True], [True, False, True]], + [[True, False, False], [False, False, True], [False, True, False]], + [[True, True, True], [False, False, False], [False, False, True]], + ], + sycl_queue=q, + ) + x = _make_3d("i2", q) + y = x[mask] + expected = [0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (len(expected),) + assert all(dpt.asnumpy(y[k]) == expected[k] for k in range(len(expected))) + + +def test_advanced_slice8(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u2", q) + y = x[mask] + expected = dpt.asarray( + [[0, 1, 2], [12, 13, 14], [21, 22, 23]], sycl_queue=q + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_advanced_slice9(): + q = get_queue_or_skip() + mask = dpt.asarray( + [[True, False, False], [False, True, False], [False, True, False]], + sycl_queue=q, + ) + x = _make_3d("u4", q) + y = x[:, mask] + expected = dpt.asarray([[0, 4, 7], [9, 13, 16], [18, 22, 25]], sycl_queue=q) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def lin_id(i, j, k): + """global_linear_id for (3,3,3) range traversed in C-contiguous order""" + return 9 * i + 3 * j + k + + +def test_advanced_slice10(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[i0, i1, i2] + res_expected = dpt.asarray( + [ + lin_id(0, 1, 2), + lin_id(1, 1, 0), + lin_id(1, 2, 1), + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice11(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i0 = dpt.asarray([0, 1, 1], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + with pytest.raises(IndexError): + x[i0, :, i2] + + +def test_advanced_slice12(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([1, 1, 2], device=x.device) + i2 = dpt.asarray([2, 0, 1], device=x.device) + y = x[:, dpt.newaxis, i1, i2, dpt.newaxis] + res_expected = dpt.asarray( + [ + [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]], + [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]], + [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]], + ], + sycl_queue=q, + ) + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == res_expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all() + + +def test_advanced_slice13(): + q = get_queue_or_skip() + x = _make_3d("u8", q) + i1 = dpt.asarray([[1], [2]], device=x.device) + i2 = dpt.asarray([[0, 1]], device=x.device) + y = x[i1, i2, 0] + expected = dpt.asarray( + [ + [lin_id(1, 0, 0), lin_id(1, 1, 0)], + [lin_id(2, 0, 0), lin_id(2, 1, 0)], + ], + device=x.device, + ) + assert isinstance(y, 
dpt.usm_ndarray) + assert y.shape == expected.shape + assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all() + + +def test_advanced_slice14(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5) + y = x[ii, 0, ii, 1, :] + assert isinstance(y, dpt.usm_ndarray) + # integers broadcast to ii.shape per array API + assert y.shape == ii.shape + x.shape[-1:] + assert _all_equal( + ( + x[ii[i], 0, ii[i], 1, k] + for i in range(ii.shape[0]) + for k in range(x.shape[-1]) + ), + (y[i, k] for i in range(ii.shape[0]) for k in range(x.shape[-1])), + ) + + +def test_advanced_slice15(): + q = get_queue_or_skip() + ii = dpt.asarray([1, 2], sycl_queue=q) + x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5) + # : cannot appear between two integral arrays + with pytest.raises(IndexError): + x[ii, 0, ii, :, ii] + + +def test_advanced_slice16(): + q = get_queue_or_skip() + ii = dpt.asarray(1, sycl_queue=q) + i0 = dpt.asarray(False, sycl_queue=q) + i1 = dpt.asarray(True, sycl_queue=q) + x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5) + y = x[ii, i0, ii, i1, :] + # TODO: add a shape check here when discrepancy with NumPy is investigated + assert isinstance(y, dpt.usm_ndarray) + + +def test_integer_indexing_numpy_array(): + q = get_queue_or_skip() + ii = np.asarray([1, 2]) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == ii.shape + assert dpt.all(x[1:3] == y) + + +def test_boolean_indexing_numpy_array(): + q = get_queue_or_skip() + ii = np.asarray( + [False, True, True, False, False, False, False, False, False, False] + ) + x = dpt.arange(10, dtype="i4", sycl_queue=q) + y = x[ii] + assert isinstance(y, dpt.usm_ndarray) + assert y.shape == (2,) + assert dpt.all(x[1:3] == y) + + +def test_boolean_indexing_validation(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="i4") + ii = dpt.ones((2, 5), dtype="?") + with pytest.raises(IndexError): + x[ii] + with pytest.raises(IndexError): + x[ii[0, :]] + + +def test_boolean_indexing_getitem_empty_mask(): + get_queue_or_skip() + x = dpt.ones((2, 3, 4), dtype="i4") + ii = dpt.ones((0,), dtype="?") + assert x[ii].size == 0 + ii1 = dpt.ones((0, 3), dtype="?") + assert x[ii1].size == 0 + ii2 = dpt.ones((0, 3, 4), dtype="?") + assert x[ii2].size == 0 + + +def test_boolean_indexing_setitem_empty_mask(): + get_queue_or_skip() + x = dpt.ones((2, 3, 4), dtype="i4") + ii = dpt.ones((0,), dtype="?") + x[ii] = 0 + assert dpt.all(x == 1) + ii1 = dpt.ones((0, 3), dtype="?") + x[ii1] = 0 + assert dpt.all(x == 1) + ii2 = dpt.ones((0, 3, 4), dtype="?") + x[ii2] = 0 + assert dpt.all(x == 1) + + +def test_integer_indexing_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i4") + ind_1d = dpt.asarray([7, 3, 1], dtype="u2") + ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + + y1 = x[ind_1d] + assert y1.shape == ind_1d.shape + y2 = x[ind_2d] + assert y2.shape == ind_2d.shape + assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all() + assert ( + dpt.asnumpy(y2) + == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4") + ).all() + + +def test_integer_indexing_2d(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 
13]])).all() + + +def test_integer_strided_indexing(): + get_queue_or_skip() + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(2 * n0 * n1, dtype="i4"), + ( + 2 * n0, + n1, + ), + ) + ind0 = dpt.arange(n0) + ind1 = dpt.arange(n1) + + z = x[::-2, :] + y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert y.dtype == x.dtype + zc = dpt.copy(z, order="C") + yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]] + assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all() + + +def test_TrueFalse_indexing(): + get_queue_or_skip() + n0, n1 = 2, 3 + x = dpt.ones((n0, n1)) + for ind in [True, dpt.asarray(True)]: + y1 = x[ind] + assert y1.shape == (1, n0, n1) + assert y1._pointer == x._pointer + y2 = x[:, ind] + assert y2.shape == (n0, 1, n1) + assert y2._pointer == x._pointer + y3 = x[..., ind] + assert y3.shape == (n0, n1, 1) + assert y3._pointer == x._pointer + for ind in [False, dpt.asarray(False)]: + y1 = x[ind] + assert y1.shape == (0, n0, n1) + assert y1._pointer == x._pointer + y2 = x[:, ind] + assert y2.shape == (n0, 0, n1) + assert y2._pointer == x._pointer + y3 = x[..., ind] + assert y3.shape == (n0, n1, 0) + assert y3._pointer == x._pointer + + +def test_mixed_index_getitem(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10)) + i1b = dpt.ones(10, dtype="?") + info = x.__array_namespace__().__array_namespace_info__() + ind_dt = info.default_dtypes(device=x.device)["indexing"] + i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis] + i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis] + y = x[i0, i1b, i2] + assert y.shape == (3, dpt.sum(i1b, dtype="i8")) + + +def test_mixed_index_setitem(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10)) + i1b = dpt.ones(10, dtype="?") + info = x.__array_namespace__().__array_namespace_info__() + ind_dt = info.default_dtypes(device=x.device)["indexing"] + i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis] + i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis] + v_shape = (3, int(dpt.sum(i1b, dtype="i8"))) + canary = 7 + x[i0, i1b, i2] = dpt.full(v_shape, canary, dtype=x.dtype) + assert x[0, 0, 3] == canary + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_basic(data_dt, ind_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + y = dpt.take(x, ind) + assert y.dtype == x.dtype + assert (dpt.asnumpy(y) == np.arange(2, 5, dtype=data_dt)).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_basic(data_dt, ind_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(10, dtype=data_dt) + ind = dpt.arange(2, 5, dtype=ind_dt) + val = dpt.ones(3, dtype=data_dt) + dpt.put(x, ind, val) + assert ( + dpt.asnumpy(x) + == np.array([0, 1, 1, 1, 1, 5, 6, 7, 8, 9], dtype=data_dt) + ).all() + + +def test_take_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + y0 = dpt.take(x, ind, axis=0) + y1 = dpt.take(x, ind, axis=1) + assert y0.shape == (2, n1) + assert y1.shape == (n0, 2) + + +def test_put_basic_axis(): + get_queue_or_skip() + + n0, n1 = 5, 7 + x = dpt.reshape( + dpt.arange(n0 * n1, dtype="i4"), + ( + n0, + n1, + ), + ) + ind = dpt.arange(2, 4) + v0 = dpt.zeros((2, n1), 
dtype=x.dtype) + v1 = dpt.zeros((n0, 2), dtype=x.dtype) + dpt.put(x, ind, v0, axis=0) + dpt.put(x, ind, v1, axis=1) + expected = np.arange(n0 * n1, dtype="i4").reshape((n0, n1)) + expected[[2, 3], :] = 0 + expected[:, [2, 3]] = 0 + assert (expected == dpt.asnumpy(x)).all() + + +@pytest.mark.parametrize("data_dt", _all_dtypes) +def test_put_0d_val(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(5, dtype=data_dt, sycl_queue=q) + ind = dpt.asarray([0], dtype="i8", sycl_queue=q) + val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q) + x[ind] = val + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0])) + + x = dpt.asarray(5, dtype=data_dt, sycl_queue=q) + dpt.put(x, ind, val) + assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x)) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype="i8", sycl_queue=q) + + y = dpt.take(x, ind) + assert ( + dpt.asnumpy(y) + == np.broadcast_to(np.asarray(0, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_put_0d_data(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.asarray(0, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(5, dtype="i8", sycl_queue=q) + val = dpt.asarray(2, dtype=data_dt, sycl_queue=q) + + dpt.put(x, ind, val, axis=0) + assert ( + dpt.asnumpy(x) + == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape) + ).all() + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_indexing_0d_ind(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) + + y = x[ind] + assert dpt.asnumpy(x[3]) == dpt.asnumpy(y) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_0d_ind(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(5, dtype="i4", sycl_queue=q) + ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q) + + x[ind] = val + assert dpt.asnumpy(x[3]) == dpt.asnumpy(val) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +def test_take_strided_1d_source(data_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np[s], ind_np, axis=0), + dpt.asnumpy(dpt.take(x[s], ind, axis=0)), + ) + + # 0-strided + x = dpt.usm_ndarray( + (27,), + dtype=data_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + x[0] = x_np[0] + assert_array_equal( + np.broadcast_to(x_np[0], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in (-1, 1): + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + assert_array_equal( + 
np.take(xs_np, ind_np, axis=0), + dpt.asnumpy(dpt.take(xs, ind, axis=0)), + ) + assert_array_equal( + np.take(xs_np, ind_np, axis=1), + dpt.asnumpy(dpt.take(xs, ind, axis=1)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_take_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + assert_array_equal( + np.take(x_np, ind_np[s], axis=0), + dpt.asnumpy(dpt.take(x, ind[s], axis=0)), + ) + + # 0-strided + ind = dpt.usm_ndarray( + (12,), + dtype=ind_dt, + strides=(0,), + buffer_ctor_kwargs={"queue": q}, + ) + ind[0] = ind_np[0] + assert_array_equal( + np.broadcast_to(x_np[ind_np[0]], ind.shape), + dpt.asnumpy(dpt.take(x, ind, axis=0)), + ) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + assert_array_equal( + np.take(x_np, inds_np, axis=0), + dpt.asnumpy(x[inds]), + ) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_1d_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.arange(27, dtype=data_dt, sycl_queue=q) + ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_np1 = x_np.copy() + x_np1[s][ind_np] = val_np + + x1 = dpt.copy(x) + dpt.put(x1[s], ind, val, axis=0) + + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "data_dt", + _all_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_destination(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind) + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + xs = x[s, ::sgn] + xs_np = x_np[s, ::sgn] + + x_np1 = xs_np.copy() + x_np1[ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=0) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[:, ind_np] = val_np + + x1 = dpt.copy(xs) + dpt.put(x1, ind, val, axis=1) + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + x_np1 = xs_np.copy() + x_np1[ind_np, ind_np] = val_np + + x1 = dpt.copy(xs) + x1[ind, ind] = val + assert_array_equal(x_np1, dpt.asnumpy(x1)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +def test_put_strided_1d_indices(ind_dt): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q) + val = dpt.asarray(-1, 
dtype=x.dtype, sycl_queue=q) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + x_copy = dpt.copy(x) + dpt.put(x_copy, ind[s], val, axis=0) + + x_np_copy = x_np.copy() + x_np_copy[ind_np[s]] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +@pytest.mark.parametrize( + "ind_dt", + _all_int_dtypes, +) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_put_strided_indices(ind_dt, order): + q = get_queue_or_skip() + + x = dpt.arange(27, dtype="i4", sycl_queue=q) + ind = dpt.reshape( + dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order + ) + val = dpt.asarray(-1, sycl_queue=q, dtype=x.dtype) + + x_np = dpt.asnumpy(x) + ind_np = dpt.asnumpy(ind).astype("i8") + val_np = dpt.asnumpy(val) + + for s in ( + slice(None, None, 2), + slice(None, None, -2), + ): + for sgn in [-1, 1]: + inds = ind[s, ::sgn] + inds_np = ind_np[s, ::sgn] + + x_copy = dpt.copy(x) + x_copy[inds] = val + + x_np_copy = x_np.copy() + x_np_copy[inds_np] = val_np + + assert_array_equal(x_np_copy, dpt.asnumpy(x_copy)) + + +def test_integer_indexing_modes(): + q = get_queue_or_skip() + + x = dpt.arange(5, sycl_queue=q) + x_np = dpt.asnumpy(x) + + # wrapping negative indices + ind = dpt.asarray([-4, -3, 0, 2, 4], dtype="i8", sycl_queue=q) + + res = dpt.take(x, ind, mode="wrap") + expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="raise") + + assert (dpt.asnumpy(res) == expected_arr).all() + + # clipping to 0 (disabling negative indices) + ind = dpt.asarray([-6, -3, 0, 2, 6], dtype="i8", sycl_queue=q) + + res = dpt.take(x, ind, mode="clip") + expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="clip") + + assert (dpt.asnumpy(res) == expected_arr).all() + + +def test_take_arg_validation(): + q = get_queue_or_skip() + + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(4, dtype="i8", sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) + + with pytest.raises(TypeError): + dpt.take(dict(), ind0, axis=0) + with pytest.raises(TypeError): + dpt.take(x, dict(), axis=0) + with pytest.raises(IndexError): + x[[]] + with pytest.raises(IndexError): + dpt.take(x, ind1, axis=0) + with pytest.raises(IndexError): + x[ind1] + + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0) + with pytest.raises(ValueError): + dpt.take(x, ind0, mode=0) + with pytest.raises(ValueError): + dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None) + with pytest.raises(ValueError): + dpt.take(x, dpt.reshape(ind0, (2, 2))) + with pytest.raises(ValueError): + dpt.take(x[0], ind0, axis=2) + with pytest.raises(ValueError): + dpt.take(x[:, dpt.newaxis, dpt.newaxis], ind0, axis=None) + + +def test_put_arg_validation(): + q = get_queue_or_skip() + + x = dpt.arange(4, dtype="i4", sycl_queue=q) + ind0 = dpt.arange(4, dtype="i8", sycl_queue=q) + ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q) + val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q) + + with pytest.raises(TypeError): + dpt.put(dict(), ind0, val, axis=0) + with pytest.raises(TypeError): + dpt.put(x, dict(), val, axis=0) + with pytest.raises(IndexError): + x[[]] = val + with pytest.raises(IndexError): + dpt.put(x, ind1, val, axis=0) + with pytest.raises(IndexError): + x[ind1] = val + with pytest.raises(TypeError): + dpt.put(x, ind0, {}, axis=0) + with pytest.raises(TypeError): + x[ind0] = {} + + with pytest.raises(ValueError): + dpt.put(x, ind0, val, mode=0) + with pytest.raises(ValueError): + dpt.put(x, dpt.reshape(ind0, (2, 
2)), val) + with pytest.raises(ValueError): + dpt.put(x[0], ind0, val, axis=2) + with pytest.raises(ValueError): + dpt.put(x[:, dpt.newaxis, dpt.newaxis], ind0, val, axis=None) + + +def test_advanced_indexing_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.arange(4, sycl_queue=q1) + ind0 = dpt.asarray([0], sycl_queue=q1) + ind1 = dpt.asarray([0], sycl_queue=q2) + val0 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q1) + val1 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.take(x, ind1, axis=0) + with pytest.raises(dpt.ExecutionPlacementError): + x[ind1] + with pytest.raises(dpt.ExecutionPlacementError): + dpt.put(x, ind1, val0, axis=0) + with pytest.raises(dpt.ExecutionPlacementError): + x[ind1] = val0 + with pytest.raises(dpt.ExecutionPlacementError): + dpt.put(x, ind0, val1, axis=0) + with pytest.raises(dpt.ExecutionPlacementError): + x[ind0] = val1 + + +def test_extract_all_1d(): + get_queue_or_skip() + x = dpt.arange(30, dtype="i4") + sel = dpt.ones(30, dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + # test strided case + x = dpt.arange(15, dtype="i4") + sel_np = np.zeros(15, dtype="?") + np.put(sel_np, np.random.choice(sel_np.size, size=7), True) + sel = dpt.asarray(sel_np) + + res = x[sel[::-1]] + expected_res = dpt.asnumpy(x)[sel_np[::-1]] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel[::-1], x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_all_2d(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(30, dtype="?") + sel[::2] = False + sel = dpt.reshape(sel, x.shape) + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel, x) + assert (dpt.asnumpy(res2) == expected_res).all() + + +def test_extract_2D_axis0(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[0], dtype="?") + sel[::2] = False + + res = x[sel] + expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected_res).all() + + +def test_extract_2D_axis1(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6)) + sel = dpt.ones(x.shape[1], dtype="?") + sel[::2] = False + + res = x[:, sel] + expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)] + assert (dpt.asnumpy(res) == expected).all() + + +def test_extract_begin(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + expected = dpt.asnumpy(y)[[0, 1], [0, 1]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_end(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + z = y[..., sel] + expected = dpt.asnumpy(y)[..., [0], [0]] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_middle(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + z = y[:, sel] + 
expected = dpt.asnumpy(y)[:, [0], [0], :] + assert (dpt.asnumpy(z) == expected).all() + + +def test_extract_empty_result(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + z = y[:, sel] + assert z.shape == ( + y.shape[0], + 0, + y.shape[3], + ) + + +def test_place_all_1d(): + get_queue_or_skip() + x = dpt.arange(10, dtype="i2") + sel = dpt.zeros(10, dtype="?") + sel[0::2] = True + val = dpt.zeros(5, dtype=x.dtype) + x[sel] = val + assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all() + dpt.place(x, sel, dpt.asarray([2])) + assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all() + + +def test_place_2d_axis0(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True]) + val = dpt.zeros((2, 4), dtype=x.dtype) + x[sel] = val + expected_x = np.stack( + ( + np.zeros(4, dtype="i2"), + np.arange(4, 8, dtype="i2"), + np.zeros(4, dtype="i2"), + ) + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros((3, 2), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_2d_axis1_scalar(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray([True, False, True, False]) + val = dpt.zeros(tuple(), dtype=x.dtype) + x[:, sel] = val + expected_x = np.array( + [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2" + ) + assert (dpt.asnumpy(x) == expected_x).all() + + +def test_place_all_slices(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4)) + sel = dpt.asarray( + [ + [False, True, True, False], + [True, True, False, False], + [False, False, True, True], + ], + dtype="?", + ) + y = dpt.ones_like(x) + y[sel] = x[sel] + + +def test_place_some_slices_begin(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 3), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[sel] + w = dpt.zeros_like(y) + w[sel] = z + + +def test_place_some_slices_mid(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((3, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, sel] + w = dpt.zeros_like(y) + w[:, sel] = z + + +def test_place_some_slices_end(): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4)) + y = dpt.permute_dims(x, (2, 0, 3, 1)) + sel = dpt.zeros((4, 4), dtype="?") + sel[0, 0] = True + sel[1, 1] = True + z = y[:, :, sel] + w = dpt.zeros_like(y) + w[:, :, sel] = z + + +def test_place_cycling(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="f4") + y = dpt.asarray([2, 3]) + sel = dpt.ones(x.size, dtype="?") + dpt.place(x, sel, y) + expected = np.array( + [ + 2, + 3, + ] + * 5, + dtype=x.dtype, + ) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_subset(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="f4") + y = dpt.ones_like(x) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + dpt.place(x, sel, y) + expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], 
dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_place_empty_vals_error(): + get_queue_or_skip() + x = dpt.zeros(10, dtype="f4") + y = dpt.empty((0,), dtype=x.dtype) + sel = dpt.ones(x.size, dtype="?") + sel[::2] = False + with pytest.raises(ValueError): + dpt.place(x, sel, y) + + +def test_place_empty_vals_full_false_mask(): + get_queue_or_skip() + x = dpt.ones(10, dtype="f4") + y = dpt.empty((0,), dtype=x.dtype) + sel = dpt.zeros(x.size, dtype="?") + expected = np.ones(10, dtype=x.dtype) + dpt.place(x, sel, y) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero(): + get_queue_or_skip() + x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3))) + (i,) = dpt.nonzero(x) + assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all() + + +def test_nonzero_f_contig(): + "See gh-1370" + get_queue_or_skip() + + mask = dpt.zeros((5, 5), dtype="?", order="F") + mask[2, 3] = True + + expected_res = np.nonzero(dpt.asnumpy(mask)) + result = dpt.nonzero(mask) + + for exp, res in zip(expected_res, result): + assert_array_equal(dpt.asnumpy(res), exp) + assert dpt.asnumpy(mask[result]).all() + + +def test_nonzero_compacting(): + """See gh-1370. + Test with input where dimensionality + of iteration space is compacted from 3d to 2d + """ + get_queue_or_skip() + + mask = dpt.zeros((5, 5, 5), dtype="?", order="F") + mask[3, 2, 1] = True + mask_view = mask[..., :3] + + expected_res = np.nonzero(dpt.asnumpy(mask_view)) + result = dpt.nonzero(mask_view) + + for exp, res in zip(expected_res, result): + assert_array_equal(dpt.asnumpy(res), exp) + assert dpt.asnumpy(mask_view[result]).all() + + +def test_assign_scalar(): + get_queue_or_skip() + x = dpt.arange(-5, 5, dtype="i8") + cond = dpt.asarray( + [True, True, True, True, True, False, False, False, False, False] + ) + x[cond] = 0 # no error expected + x[dpt.nonzero(cond)] = -1 + expected = np.array([-1, -1, -1, -1, -1, 0, 1, 2, 3, 4], dtype=x.dtype) + assert (dpt.asnumpy(x) == expected).all() + + +def test_nonzero_large(): + get_queue_or_skip() + m = dpt.full((60, 80), True) + assert m[m].size == m.size + + m = dpt.full((30, 60, 80), True) + assert m[m].size == m.size + + +def test_extract_arg_validation(): + get_queue_or_skip() + with pytest.raises(TypeError): + dpt.extract(None, None) + cond = dpt.ones(10, dtype="?") + with pytest.raises(TypeError): + dpt.extract(cond, None) + q1 = dpctl.SyclQueue() + with pytest.raises(dpt.ExecutionPlacementError): + dpt.extract(cond.to_device(q1), dpt.zeros_like(cond, dtype="u1")) + with pytest.raises(ValueError): + dpt.extract(dpt.ones((2, 3), dtype="?"), dpt.ones((3, 2), dtype="i1")) + + +def test_place_arg_validation(): + get_queue_or_skip() + with pytest.raises(TypeError): + dpt.place(None, None, None) + arr = dpt.zeros(8, dtype="i1") + with pytest.raises(TypeError): + dpt.place(arr, None, None) + cond = dpt.ones(8, dtype="?") + with pytest.raises(TypeError): + dpt.place(arr, cond, None) + vals = dpt.ones_like(arr) + q1 = dpctl.SyclQueue() + with pytest.raises(dpt.ExecutionPlacementError): + dpt.place(arr.to_device(q1), cond, vals) + with pytest.raises(ValueError): + dpt.place(dpt.reshape(arr, (2, 2, 2)), cond, vals) + + +def test_nonzero_arg_validation(): + get_queue_or_skip() + with pytest.raises(TypeError): + dpt.nonzero(list()) + with pytest.raises(ValueError): + dpt.nonzero(dpt.asarray(1)) + + +def test_nonzero_dtype(): + "See gh-1322" + get_queue_or_skip() + x = dpt.ones((3, 4)) + idx, idy = dpt.nonzero(x) + # create array using device's + # default index data type + index_dt = 
dpt.dtype(ti.default_device_index_type(x.sycl_queue))
+    assert idx.dtype == index_dt
+    assert idy.dtype == index_dt
+
+
+def test_take_empty_axes():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+
+    with pytest.raises(IndexError):
+        dpt.take(x, inds, axis=1)
+
+    inds = dpt.ones(0, dtype="i4")
+    r = dpt.take(x, inds, axis=1)
+    assert r.shape == x.shape
+
+
+def test_put_empty_axes():
+    get_queue_or_skip()
+
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+    vals = dpt.zeros((3, 1, 4, 5, 6), dtype="f4")
+
+    with pytest.raises(IndexError):
+        dpt.put(x, inds, vals, axis=1)
+
+    inds = dpt.ones(0, dtype="i4")
+    vals = dpt.zeros_like(x)
+
+    with pytest.raises(ValueError):
+        dpt.put(x, inds, vals, axis=1)
+
+
+def test_put_cast_vals():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    inds = dpt.arange(7, 10, dtype="i4")
+    vals = dpt.zeros_like(inds, dtype="f4")
+
+    dpt.put(x, inds, vals)
+    assert dpt.all(x[7:10] == 0)
+
+
+def test_advanced_integer_indexing_cast_vals():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    inds = dpt.arange(7, 10, dtype="i4")
+    vals = dpt.zeros_like(inds, dtype="f4")
+
+    x[inds] = vals
+    assert dpt.all(x[7:10] == 0)
+
+
+def test_advanced_integer_indexing_empty_axis():
+    get_queue_or_skip()
+
+    # getting
+    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
+    inds = dpt.ones(1, dtype="i4")
+    with pytest.raises(IndexError):
+        x[:, inds, ...]
+    with pytest.raises(IndexError):
+        x[inds, inds, inds, ...]
+
+    # setting
+    with pytest.raises(IndexError):
+        x[:, inds, ...] = 2
+    with pytest.raises(IndexError):
+        x[inds, inds, inds, ...] = 2
+
+    # empty inds
+    inds = dpt.ones(0, dtype="i4")
+    assert x[:, inds, ...].shape == x.shape
+    assert x[inds, inds, inds, ...].shape == (0, 5, 6)
+
+    vals = dpt.zeros_like(x)
+    x[:, inds, ...] = vals
+    vals = dpt.zeros((0, 5, 6), dtype="f4")
+    x[inds, inds, inds, ...] = vals
+
+
+def test_advanced_integer_indexing_cast_indices():
+    get_queue_or_skip()
+
+    for ind_dts in (("i1", "i2", "i4"), ("i1", "u4", "i4"), ("u1", "u2", "u8")):
+        x = dpt.ones((3, 4, 5, 6), dtype="i4")
+        inds0 = dpt.asarray([0, 1], dtype=ind_dts[0])
+        inds1 = dpt.astype(inds0, ind_dts[1])
+        x[inds0, inds1, ...] = 2
+        assert dpt.all(x[inds0, inds1, ...] == 2)
+        inds2 = dpt.astype(inds0, ind_dts[2])
+        x[inds0, inds1, inds2, ...] = 2
+        assert dpt.all(x[inds0, inds1, inds2, ...] == 2)
+
+    # fail when type promotion of the index arrays would require
+    # a floating-point type
+    inds0 = dpt.asarray([0, 1], dtype="i1")
+    inds1 = dpt.astype(inds0, "u4")
+    inds2 = dpt.astype(inds0, "u8")
+    x = dpt.ones((3, 4, 5, 6), dtype="i4")
+    # test getitem
+    with pytest.raises(ValueError):
+        x[inds0, inds1, inds2, ...]
+    # test setitem
+    with pytest.raises(ValueError):
+        x[inds0, inds1, inds2, ...] = 1
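+
+# A minimal, self-contained sketch of the index-array promotion rule the
+# cast-indices tests above rely on.  The helper below is illustrative only
+# (its name and error message are not part of the library); the assumption
+# is that mixed integer index arrays are cast to their common result_type,
+# and indexing is rejected when that promoted type is not an integer type
+# (e.g. "i1" together with "u8" promotes to "f8").
+def _promote_index_arrays_sketch(*inds):
+    res_dt = dpt.result_type(*inds)
+    if res_dt.kind not in "iu":
+        # promotion landed on a floating-point type: no safe integer cast
+        raise ValueError("index arrays must share an integer result_type")
+    return tuple(dpt.astype(ind, res_dt) for ind in inds)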
+
+
+def test_take_along_axis():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 3, 5, 7
+    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
+    ind_dt = dpt.__array_namespace_info__().default_dtypes(
+        device=x.sycl_device
+    )["indexing"]
+    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
+    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
+    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
+
+    y0 = dpt.take_along_axis(x, ind0, axis=0)
+    assert y0.shape == ind0.shape
+    y1 = dpt.take_along_axis(x, ind1, axis=1)
+    assert y1.shape == ind1.shape
+    y2 = dpt.take_along_axis(x, ind2, axis=2)
+    assert y2.shape == ind2.shape
+
+
+def test_take_along_axis_validation():
+    # validate first argument
+    with pytest.raises(TypeError):
+        dpt.take_along_axis(tuple(), list())
+    get_queue_or_skip()
+    n1, n2 = 2, 5
+    x = dpt.ones(n1 * n2)
+    # validate second argument
+    with pytest.raises(TypeError):
+        dpt.take_along_axis(x, list())
+    x_dev = x.sycl_device
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=x_dev)
+    ind_dt = def_dtypes["indexing"]
+    ind = dpt.zeros(1, dtype=ind_dt)
+    # axis validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(x, ind, axis=1)
+    # mode validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(x, ind, axis=0, mode="invalid")
+    # same array-ranks validation
+    with pytest.raises(ValueError):
+        dpt.take_along_axis(dpt.reshape(x, (n1, n2)), ind)
+    # check compute-follows-data
+    q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
+    ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.take_along_axis(x, ind2)
+
+
+def test_put_along_axis():
+    get_queue_or_skip()
+
+    n0, n1, n2 = 3, 5, 7
+    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
+    ind_dt = dpt.__array_namespace_info__().default_dtypes(
+        device=x.sycl_device
+    )["indexing"]
+    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
+    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
+    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind0.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind0, vals, axis=0)
+    assert dpt.all(dpt.take_along_axis(xc, ind0, axis=0) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind1.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind1, vals, axis=1)
+    assert dpt.all(dpt.take_along_axis(xc, ind1, axis=1) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind2.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind2, vals, axis=2)
+    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
+
+    xc = dpt.copy(x)
+    vals = dpt.ones(ind2.shape, dtype=x.dtype)
+    dpt.put_along_axis(xc, ind2, dpt.asnumpy(vals), axis=2)
+    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
+
+
+def test_put_along_axis_validation():
+    # validate first argument
+    with pytest.raises(TypeError):
+        dpt.put_along_axis(tuple(), list(), list())
+    get_queue_or_skip()
+    n1, n2 = 2, 5
+    x = dpt.ones(n1 * n2)
+    # validate second argument
+    with pytest.raises(TypeError):
+        dpt.put_along_axis(x, list(), list())
+    x_dev = x.sycl_device
+    info_ = dpt.__array_namespace_info__()
+    def_dtypes = info_.default_dtypes(device=x_dev)
+    ind_dt = def_dtypes["indexing"]
+    ind = dpt.zeros(1, dtype=ind_dt)
+    vals = dpt.zeros(1, dtype=x.dtype)
+    # axis validation
+    with pytest.raises(ValueError):
+        dpt.put_along_axis(x, ind, vals, axis=1)
+    # mode validation
+    with pytest.raises(ValueError):
+        dpt.put_along_axis(x, ind, vals, axis=0, mode="invalid")
+    # same array-ranks validation
+    with pytest.raises(ValueError):
dpt.put_along_axis(dpt.reshape(x, (n1, n2)), ind, vals) + # check compute-follows-data + q2 = dpctl.SyclQueue(x_dev, property="enable_profiling") + ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.put_along_axis(x, ind2, vals) + + +def test_put_along_axis_application(): + get_queue_or_skip() + info_ = dpt.__array_namespace_info__() + def_dtypes = info_.default_dtypes(device=None) + ind_dt = def_dtypes["indexing"] + all_perms = dpt.asarray( + [ + [0, 1, 2, 3], + [0, 2, 1, 3], + [2, 0, 1, 3], + [2, 1, 0, 3], + [1, 0, 2, 3], + [1, 2, 0, 3], + [0, 1, 3, 2], + [0, 2, 3, 1], + [2, 0, 3, 1], + [2, 1, 3, 0], + [1, 0, 3, 2], + [1, 2, 3, 0], + [0, 3, 1, 2], + [0, 3, 2, 1], + [2, 3, 0, 1], + [2, 3, 1, 0], + [1, 3, 0, 2], + [1, 3, 2, 0], + [3, 0, 1, 2], + [3, 0, 2, 1], + [3, 2, 0, 1], + [3, 2, 1, 0], + [3, 1, 0, 2], + [3, 1, 2, 0], + ], + dtype=ind_dt, + ) + p_mats = dpt.zeros((24, 4, 4), dtype=dpt.int64) + vals = dpt.ones((24, 4, 1), dtype=p_mats.dtype) + # form 24 permutation matrices + dpt.put_along_axis(p_mats, all_perms[..., dpt.newaxis], vals, axis=2) + p2 = p_mats @ p_mats + p4 = p2 @ p2 + p8 = p4 @ p4 + expected = dpt.eye(4, dtype=p_mats.dtype)[dpt.newaxis, ...] + assert dpt.all(p8 @ p4 == expected) + + +def check__extract_impl_validation(fn): + x = dpt.ones(10) + ind = dpt.ones(10, dtype="?") + with pytest.raises(TypeError): + fn(list(), ind) + with pytest.raises(TypeError): + fn(x, list()) + q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling") + ind2 = dpt.ones(10, dtype="?", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + fn(x, ind2) + with pytest.raises(ValueError): + fn(x, ind, 1) + + +def check__nonzero_impl_validation(fn): + with pytest.raises(TypeError): + fn(list()) + + +def check__take_multi_index(fn): + x = dpt.ones(10) + x_dev = x.sycl_device + info_ = dpt.__array_namespace_info__() + def_dtypes = info_.default_dtypes(device=x_dev) + ind_dt = def_dtypes["indexing"] + ind = dpt.arange(10, dtype=ind_dt) + with pytest.raises(TypeError): + fn(list(), tuple(), 1) + with pytest.raises(ValueError): + fn(x, (ind,), 0, mode=2) + with pytest.raises(ValueError): + fn(x, (None,), 1) + with pytest.raises(IndexError): + fn(x, (x,), 1) + q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling") + ind2 = dpt.arange(10, dtype=ind_dt, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + fn(x, (ind2,), 0) + m = dpt.ones((10, 10)) + ind_1 = dpt.arange(10, dtype="i8") + ind_2 = dpt.arange(10, dtype="u8") + with pytest.raises(ValueError): + fn(m, (ind_1, ind_2), 0) + + +def check__place_impl_validation(fn): + with pytest.raises(TypeError): + fn(list(), list(), list()) + x = dpt.ones(10) + with pytest.raises(TypeError): + fn(x, list(), list()) + q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling") + mask2 = dpt.ones(10, dtype="?", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + fn(x, mask2, 1) + x2 = dpt.ones((5, 5)) + mask2 = dpt.ones((5, 5), dtype="?") + with pytest.raises(ValueError): + fn(x2, mask2, x2, axis=1) + + +def check__put_multi_index_validation(fn): + with pytest.raises(TypeError): + fn(list(), list(), 0, list()) + x = dpt.ones(10) + inds = dpt.arange(10, dtype="i8") + vals = dpt.zeros(10) + # test inds which is not a tuple/list + fn(x, inds, 0, vals) + x2 = dpt.ones((5, 5)) + ind1 = dpt.arange(5, dtype="i8") + ind2 = dpt.arange(5, dtype="u8") + with pytest.raises(ValueError): + fn(x2, (ind1, ind2), 0, x2) + with pytest.raises(TypeError): + # invalid 
index type + fn(x2, (ind1, list()), 0, x2) + with pytest.raises(ValueError): + # invalid mode keyword value + fn(x, inds, 0, vals, mode=100) + + +def test__copy_utils(): + import dpnp.tensor._copy_utils as cu + + get_queue_or_skip() + + check__extract_impl_validation(cu._extract_impl) + check__nonzero_impl_validation(cu._nonzero_impl) + check__take_multi_index(cu._take_multi_index) + check__place_impl_validation(cu._place_impl) + check__put_multi_index_validation(cu._put_multi_index) + + +@pytest.mark.parametrize("mode", ["wrap", "clip"]) +def test_take_indices_oob_py_ssize_t(mode): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + inds1 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64) + inds2 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64) + + # sweep through a small range of indices + # to check that OOB indices are well-behaved + for i in range(1, 10): + inds2 -= i + r1 = dpt.take(x, inds1, mode=mode) + r2 = dpt.take(x, inds2, mode=mode) + + assert dpt.all(r1 == r2) + + +@pytest.mark.parametrize("mode", ["wrap", "clip"]) +def test_put_indices_oob_py_ssize_t(mode): + get_queue_or_skip() + + x = dpt.full(10, -1, dtype="i4") + inds = dpt.full(1, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64) + + # OOB inds are positive, so always + # clip to the top of range + for i in range(1, 10): + inds -= i + dpt.put(x, inds, i, mode=mode) + + assert dpt.all(x[:-1] == -1) + assert x[-1] == i + + +def test_take_along_axis_uint64_indices(): + get_queue_or_skip() + + inds = dpt.arange(1, 10, 2, dtype="u8") + x = dpt.tile(dpt.asarray([0, -1], dtype="i4"), 5) + res = dpt.take_along_axis(x, inds) + assert dpt.all(res == -1) + + sh0 = 2 + inds = dpt.broadcast_to(inds, (sh0,) + inds.shape) + x = dpt.broadcast_to(x, (sh0,) + x.shape) + res = dpt.take_along_axis(x, inds, axis=1) + assert dpt.all(res == -1) + + +def test_put_along_axis_uint64_indices(): + get_queue_or_skip() + + inds = dpt.arange(1, 10, 2, dtype="u8") + x = dpt.zeros(10, dtype="i4") + dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype)) + expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), 5) + assert dpt.all(x == expected) + + sh0 = 2 + inds = dpt.broadcast_to(inds, (sh0,) + inds.shape) + x = dpt.zeros((sh0,) + x.shape, dtype="i4") + dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype), axis=1) + expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), (2, 5)) + assert dpt.all(expected == x) + + +@pytest.mark.parametrize("data_dt", _all_dtypes) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_out(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + axis = 0 + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + out_sh = x.shape[:axis] + ind.shape + x.shape[axis + 1 :] + out = dpt.empty(out_sh, dtype=data_dt, sycl_queue=q) + + expected = dpt.take(x, ind, axis=axis) + + dpt.take(x, ind, axis=axis, out=out) + + assert dpt.all(out == expected) + + +@pytest.mark.parametrize("data_dt", _all_dtypes) +@pytest.mark.parametrize("order", ["C", "F"]) +def test_take_out_overlap(data_dt, order): + q = get_queue_or_skip() + skip_if_dtype_not_supported(data_dt, q) + + axis = 0 + x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order) + ind = dpt.arange(2, dtype="i8", sycl_queue=q) + out = x[x.shape[axis] - ind.shape[axis] : x.shape[axis], :] + + expected = dpt.take(x, ind, axis=axis) + + dpt.take(x, ind, axis=axis, out=out) + + assert dpt.all(out == expected) + assert dpt.all(x[x.shape[0] - ind.shape[0] : x.shape[0], :] 
== out) + + +def test_take_out_errors(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + x = dpt.arange(10, dtype="i4", sycl_queue=q1) + ind = dpt.arange(2, dtype="i4", sycl_queue=q1) + + with pytest.raises(TypeError): + dpt.take(x, ind, out=dict()) + + out_read_only = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q1) + out_read_only.flags["W"] = False + with pytest.raises(ValueError): + dpt.take(x, ind, out=out_read_only) + + out_bad_shape = dpt.empty(0, dtype=x.dtype, sycl_queue=q1) + with pytest.raises(ValueError): + dpt.take(x, ind, out=out_bad_shape) + + out_bad_dt = dpt.empty(ind.shape, dtype="i8", sycl_queue=q1) + with pytest.raises(ValueError): + dpt.take(x, ind, out=out_bad_dt) + + out_bad_q = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.take(x, ind, out=out_bad_q) + + +def test_getitem_impl_fn_invalid_inp(): + get_queue_or_skip() + + x = dpt.ones((10, 10), dtype="i4") + + bad_ind_type = (dpt.ones((), dtype="i4"), 2.0) + with pytest.raises(TypeError): + _take_multi_index(x, bad_ind_type, 0, 0) + + no_array_inds = (2, 3) + with pytest.raises(TypeError): + _take_multi_index(x, no_array_inds, 0, 0) diff --git a/dpnp/tests/tensor/test_usm_ndarray_linalg.py b/dpnp/tests/tensor/test_usm_ndarray_linalg.py new file mode 100644 index 000000000000..c28754ca080f --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_linalg.py @@ -0,0 +1,1030 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import itertools + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_numeric_types = [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +def _map_int_to_type(n, dt): + assert isinstance(n, int) + assert n > 0 + if dt == dpt.int8: + return ((n + 128) % 256) - 128 + elif dt == dpt.uint8: + return n % 256 + elif dt == dpt.int16: + return ((n + 32768) % 65536) - 32768 + elif dt == dpt.uint16: + return n % 65536 + return n + + +def test_matrix_transpose(): + get_queue_or_skip() + + X = dpt.reshape(dpt.arange(2 * 3, dtype="i4"), (2, 3)) + res = dpt.matrix_transpose(X) + expected_res = X.mT + + assert expected_res.shape == res.shape + assert expected_res.flags["C"] == res.flags["C"] + assert expected_res.flags["F"] == res.flags["F"] + assert dpt.all(X.mT == res) + + +def test_matrix_transpose_arg_validation(): + get_queue_or_skip() + + X = dpt.empty(5, dtype="i4") + with pytest.raises(ValueError): + dpt.matrix_transpose(X) + + X = {} + with pytest.raises(TypeError): + dpt.matrix_transpose(X) + + X = dpt.empty((5, 5), dtype="i4") + assert isinstance(dpt.matrix_transpose(X), dpt.usm_ndarray) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_matmul_simple(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n, m = 235, 17 + m1 = dpt.zeros((m, n), dtype=dtype) + m2 = dpt.zeros((n, m), dtype=dtype) + + dt = m1.dtype + if dt.kind in "ui": + n1 = min(n, dpt.iinfo(dt).max) + else: + n1 = n + m1[:, :n1] = dpt.ones((m, n1), dtype=dt) + m2[:n1, :] = dpt.ones((n1, m), dtype=dt) + + for k in [1, 2, 3, 4, 7, 8, 9, 15, 16, 17]: + r = dpt.matmul(m1[:k, :], m2[:, :k]) + assert dpt.all(r == dpt.full((k, k), fill_value=n1, dtype=dt)) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_matmul_nilpotent1(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 77 + N_mat = dpt.eye(n, k=1, dtype=dtype) + I_mat = dpt.eye(n, dtype=dtype) + R_mat = dpt.eye(n, dtype=dtype) + for _ in range(n + 1): + R_mat = I_mat + dpt.matmul(N_mat, R_mat) + + assert dpt.allclose(dpt.matmul(I_mat - N_mat, R_mat), I_mat) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_matmul_nilpotent2(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 128 + u = dpt.ones((n, 1), dtype=dtype) + v = dpt.ones((1, n), dtype=dtype) + + uv = dpt.matmul(u, v) + uv_ref = u * v + + assert dpt.allclose(uv, uv_ref) + + +def test_matmul_null_axis(): + get_queue_or_skip() + n = 3 + + A_mat = dpt.ones((n, 0), dtype="f4") + B_mat = dpt.ones((0, 1), dtype="f4") + + R_mat = dpt.matmul(A_mat, B_mat) + assert R_mat.shape == (n, 1) + + R_mat = dpt.matmul(A_mat, B_mat[:, :0]) + assert R_mat.shape == (n, 0) + + +@pytest.mark.parametrize("dtype", ["i4", "f4"]) +def test_matmul_dims(dtype): + get_queue_or_skip() + + n, m, k, b = 4, 5, 7, 3 + v = dpt.ones(k, dtype=dtype) + m1 = dpt.ones((n, k), dtype=dtype) + m2 = dpt.ones((k, m), dtype=dtype) + st1 = dpt.ones((b, n, k), dtype=dtype) + st2 = dpt.ones((b, k, m), dtype=dtype) + + r = dpt.matmul(v, v) + assert r.shape == () + assert dpt.round(r) == k + + r = dpt.matmul(m1, v) + assert r.shape == (n,) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(v, m2) + assert r.shape == (m,) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(m1, m2) + assert 
r.shape == ( + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(v, st2) + assert r.shape == ( + b, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(st1, v) + assert r.shape == ( + b, + n, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(st1, m2) + assert r.shape == ( + b, + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(m1, st2) + assert r.shape == ( + b, + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + r = dpt.matmul(st1, st2) + assert r.shape == ( + b, + n, + m, + ) + assert dpt.all(dpt.round(r) == k) + + +def test_matmul_arg_validation(): + get_queue_or_skip() + + s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple()) + v1, v2 = dpt.ones(16), dpt.zeros(16) + + with pytest.raises(ValueError): + dpt.matmul(s1, v2) + + with pytest.raises(ValueError): + dpt.matmul(v1, s2) + + with pytest.raises(TypeError): + dpt.matmul(dict(), v2) + + with pytest.raises(TypeError): + dpt.matmul(v2, None) + + +def test_matmul_dims_validation(): + get_queue_or_skip() + + m1 = dpt.ones((16, 16)) + m2 = dpt.ones((16, 16)) + + # contraction dimensions mismatch + with pytest.raises(ValueError): + dpt.matmul(m1[:, :7], m2[:3, :]) + + m1 = dpt.ones((3, 4, 5)) + m2 = dpt.ones((2, 5, 3)) + # broadcasting dimensions mismatch + with pytest.raises(ValueError): + dpt.matmul(m1, m2) + + +def test_matmul_broadcasting(): + get_queue_or_skip() + + for dt1, dt2 in [ + (dpt.int16, dpt.int32), + (dpt.float32, dpt.int16), + (dpt.int32, dpt.uint32), + ]: + m1 = dpt.ones((7, 11, 16), dtype=dt1) + m2 = dpt.ones((16, 13), dtype=dt2) + + r = dpt.matmul(m1, m2[dpt.newaxis, ...]) + + assert r.shape == (7, 11, 13) + + +@pytest.mark.parametrize("dtype", ["i4", "i8", "f4", "c8"]) +def test_matmul_strided(dtype): + get_queue_or_skip() + + m1_shape = (14, 22, 32) + m1_size = 1 + for el in m1_shape: + m1_size = m1_size * el + + m1 = dpt.remainder(dpt.arange(1, m1_size + 1, dtype="i8"), 13) + m1_orig = dpt.reshape(dpt.astype(m1, dtype), m1_shape) + m2_orig = dpt.ones((14, 16, 13), dtype=dtype) + + m1 = m1_orig[::2, ::-2, ::2] + m2 = m2_orig[::2, :, :] + r = dpt.matmul(m1, m2) + + assert r.shape == m1.shape[:2] + m2.shape[-1:] + ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2)) + assert np.allclose(dpt.asnumpy(r), ref) + + m1 = m1_orig[::2, ::2, ::-2] + m2 = m2_orig[::2, :, :] + r = dpt.matmul(m1, m2) + + assert r.shape == m1.shape[:2] + m2.shape[-1:] + ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2)) + assert np.allclose(dpt.asnumpy(r), ref) + + m1 = m1_orig[::-2, ::2, ::2] + m2 = m2_orig[::-2, :, :] + r = dpt.matmul(m1, m2) + + assert r.shape == m1.shape[:2] + m2.shape[-1:] + ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2)) + assert np.allclose(dpt.asnumpy(r), ref) + + +def test_matmul_out(): + get_queue_or_skip() + + m1 = ( + dpt.arange(14, dtype="f4")[:, dpt.newaxis, dpt.newaxis] + + dpt.arange(17, dtype="f4")[dpt.newaxis, :, dpt.newaxis] + + dpt.arange(128, dtype="f4")[dpt.newaxis, dpt.newaxis, :] + ) + assert m1.shape == (14, 17, 128) + m2 = dpt.tile( + dpt.reshape(dpt.asarray([1, 2], dtype="f4"), (2, 1, 1)), (7, 128, 13) + ) + assert m2.shape == (14, 128, 13) + + buf = dpt.zeros((2 * 14, 3 * 17, 13), dtype="f4") + res = dpt.matmul(m1, m2, out=buf[::-2, 1::3, :]) + + assert dpt.allclose(res, buf[::-2, 1::3, :]) + assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 0::3, :]) + assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 2::3, :]) + + m1_np = dpt.asnumpy(m1) + ref = np.matmul(m1_np, dpt.asnumpy(m2)) + assert np.allclose(ref, dpt.asnumpy(res)) + + res = dpt.matmul(m1[:, 
:10, :10], m1[:, :10, :10].mT, out=m1[:, :10, :10])
+    ref = np.matmul(
+        m1_np[:, :10, :10], np.transpose(m1_np[:, :10, :10], (0, 2, 1))
+    )
+    assert np.allclose(ref, dpt.asnumpy(res))
+
+
+def test_matmul_readonly_out():
+    get_queue_or_skip()
+    m = dpt.ones((10, 10), dtype=dpt.int32)
+    r = dpt.empty_like(m)
+    r.flags["W"] = False
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m, m, out=r)
+
+
+def test_matmul_dtype():
+    get_queue_or_skip()
+
+    for dt1, dt2 in [
+        (dpt.int32, dpt.int16),
+        (dpt.int16, dpt.int32),
+        (dpt.float32, dpt.int16),
+        (dpt.int32, dpt.float32),
+    ]:
+        m1 = dpt.ones((10, 10), dtype=dt1)
+        m2 = dpt.ones((10, 10), dtype=dt2)
+
+        for ord in ["C", "A", "F", "K"]:
+            r = dpt.matmul(m1, m2, dtype=dpt.float32, order=ord)
+            assert r.dtype == dpt.float32
+
+
+@pytest.mark.parametrize("dt1", _numeric_types)
+@pytest.mark.parametrize("dt2", _numeric_types)
+@pytest.mark.parametrize("order", ["C", "K"])
+def test_matmul_type_promotion(dt1, dt2, order):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt1, q)
+    skip_if_dtype_not_supported(dt2, q)
+
+    b, n, k, m = 8, 10, 17, 10
+    m1 = dpt.ones((1, n, k), dtype=dt1)
+    m2 = dpt.ones((b, k, m), dtype=dt2)
+    expected_dt = dpt.result_type(m1, m2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (b, n, m)
+    assert r.dtype == expected_dt
+
+    m1 = dpt.ones((b, n, k), dtype=dt1)
+    m2 = dpt.ones((1, k, m), dtype=dt2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (b, n, m)
+    assert r.dtype == expected_dt
+
+    m1 = dpt.ones((n, k), dtype=dt1)
+    m2 = dpt.ones((k, m), dtype=dt2)
+
+    r = dpt.matmul(m1, m2, order=order)
+    assert r.shape == (n, m)
+    assert r.dtype == expected_dt
+
+
+def test_matmul_invalid_dtype():
+    get_queue_or_skip()
+
+    m1 = dpt.zeros((10, 10), dtype="f4")
+    m2 = dpt.zeros((10, 10), dtype="f4")
+    m3 = dpt.zeros((10, 10), dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m3, dtype="i4")
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m3, m1, dtype="i4")
+
+
+def test_matmul_out_errors():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue()
+
+    sh = (10, 10)
+    dt = "i4"
+    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+
+    with pytest.raises(TypeError):
+        dpt.matmul(m1, m2, out=dict())
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, out=dpt.empty((10,), dtype=dt, sycl_queue=q1))
+
+    with pytest.raises(ValueError):
+        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype="f4", sycl_queue=q1))
+
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype=dt, sycl_queue=q2))
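+
+# The order handling checked in the next test can be summarized by a small
+# sketch.  The helper is illustrative only (not a library API) and encodes
+# an assumed rule inferred from the assertions below: "C" and "F" force the
+# corresponding layout, while "A" and "K" follow the inputs' layout when
+# both inputs are F-contiguous and otherwise default to C.
+def _expected_matmul_order_sketch(x1, x2, order):
+    if order == "F" or (
+        order in "AK" and x1.flags.f_contiguous and x2.flags.f_contiguous
+    ):
+        return "F"
+    return "C"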
+
+
+def test_matmul_order():
+    get_queue_or_skip()
+
+    sh = (
+        10,
+        10,
+    )
+    sh2 = tuple(2 * dim for dim in sh)
+    n = sh[-1]
+
+    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
+        ar1 = dpt.ones(sh, dtype=dt1, order="C")
+        ar2 = dpt.ones(sh, dtype=dt2, order="C")
+        r1 = dpt.matmul(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.matmul(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.matmul(ar1, ar2, order="A")
+        assert r3.flags.c_contiguous
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.flags.c_contiguous
+
+        ar1 = dpt.ones(sh, dtype=dt1, order="F")
+        ar2 = dpt.ones(sh, dtype=dt2, order="F")
+        r1 = dpt.matmul(ar1, ar2, order="C")
+        assert r1.flags.c_contiguous
+        r2 = dpt.matmul(ar1, ar2, order="F")
+        assert r2.flags.f_contiguous
+        r3 = dpt.matmul(ar1, ar2, order="A")
+        assert r3.flags.f_contiguous
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.flags.f_contiguous
+
+        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2]
+        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2]
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.strides == (n, -1)
+        r5 = dpt.matmul(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2].mT
+        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2].mT
+        r4 = dpt.matmul(ar1, ar2, order="K")
+        assert r4.strides == (-1, n)
+        r5 = dpt.matmul(ar1, ar2, order="C")
+        assert r5.strides == (n, 1)
+
+
+def test_matmul_invalid_order():
+    get_queue_or_skip()
+
+    sh = (
+        10,
+        10,
+    )
+    dt = "i4"
+
+    ar1 = dpt.ones(sh, dtype=dt, order="C")
+    ar2 = dpt.ones(sh, dtype=dt, order="C")
+    r = dpt.matmul(ar1, ar2, order="invalid")
+    assert r.flags.c_contiguous
+
+    ar1 = dpt.ones(sh, dtype=dt, order="F")
+    ar2 = dpt.ones(sh, dtype=dt, order="F")
+    r = dpt.matmul(ar1, ar2, order="invalid")
+    assert r.flags.f_contiguous
+
+
+def test_matmul_compute_follows_data():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue()
+
+    sh = (
+        10,
+        10,
+    )
+    dt = "i4"
+    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
+    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q2)
+
+    with pytest.raises(dpt.ExecutionPlacementError):
+        dpt.matmul(m1, m2)
+
+
+def test_matmul_inplace_broadcasting():
+    get_queue_or_skip()
+
+    sh = (3, 5, 5)
+    dt = "i4"
+
+    m1 = dpt.ones((3, 5, 5), dtype=dt)
+    m2 = dpt.ones((1, 5, 5), dtype=dt)
+    m1 @= m2
+    assert dpt.all(m1 == dpt.full(sh, 5, dtype=dt))
+
+
+def test_matmul_prepend_dims():
+    get_queue_or_skip()
+
+    n = 5
+    for dt1, dt2 in [
+        (dpt.int32, dpt.int32),
+        (dpt.int32, dpt.int64),
+        (dpt.int64, dpt.int32),
+        (dpt.int32, dpt.uint32),
+    ]:
+        m = dpt.ones((n, 4), dtype=dt1)
+        v = dpt.ones((4,), dtype=dt2)
+        r = dpt.matmul(m, v)
+        assert r.shape == (n,)
+
+        r = dpt.matmul(v, m.mT)
+        assert r.shape == (n,)
+
+
+def test_matmul_inplace_same_tensors():
+    get_queue_or_skip()
+
+    n = 5
+    sh = (
+        n,
+        n,
+    )
+
+    ar1 = dpt.ones(sh, dtype="i4")
+    ar1 @= ar1
+    assert dpt.all(ar1 == dpt.full(sh, n, dtype="i4"))
+
+    ar1 = dpt.ones(sh, dtype="i8")
+    ar2 = dpt.ones(sh, dtype="i4")
+    dpt.matmul(ar1, ar2, out=ar1)
+    assert dpt.all(ar1 == dpt.full(sh, n, dtype=ar1.dtype))
+
+    ar1 = dpt.ones(sh, dtype="i4")
+    ar2 = dpt.ones(sh, dtype="i8")
+    dpt.matmul(ar1, ar2, out=ar2)
+    assert dpt.all(ar2 == dpt.full(sh, n, dtype=ar2.dtype))
+
+
+@pytest.fixture
+def random_matrix():
+    rs = np.random.RandomState(seed=123456)
+    m_np = rs.randint(low=0, high=6, size=(400, 400))
+    return m_np
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_largish_square(dtype, random_matrix):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m_np = random_matrix.astype(dtype)
+    x_np = np.matmul(m_np.T, m_np)
+
+    m = dpt.asarray(m_np)
+    mT = dpt.asarray(m.mT, copy=True, order="C")
+    x1 = dpt.matmul(m.mT, m)
+    x2 = dpt.matmul(mT, m)
+
+    tol = 0
+    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
+        tol = 32 * dpt.finfo(x2.dtype).eps
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+    # check strided input
+    m_np = m_np[:-1, :-1]
+    x_np = np.matmul(m_np.T, m_np)
+
+    m = m[:-1, :-1]
+    mT = dpt.asarray(m.mT, copy=True, order="C")
+    x1 = dpt.matmul(m.mT, m)
+    x2 = dpt.matmul(mT, m)
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
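+
+# The tolerance choice above (repeated in the rectangular variant below)
+# follows a simple rule worth spelling out: integer matmul must be
+# bit-exact, while floating-point results are compared up to a small
+# multiple of the machine epsilon.  A hedged sketch of that rule (the
+# helper name is illustrative; the factor of 32 comes from these tests,
+# not from any library API):
+def _comparison_tol_sketch(dt):
+    if dpt.isdtype(dt, ("real floating", "complex floating")):
+        return 32 * dpt.finfo(dt).eps
+    return 0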
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_matmul_largish_rect(dtype, random_matrix):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m_np = random_matrix.astype(dtype)[:, :-1]
+    x_np = np.matmul(m_np.T[:-2, :], m_np)
+
+    m = dpt.asarray(m_np)
+    mmT = m.mT[:-2, :]
+    mT = dpt.asarray(mmT, copy=True, order="C")
+    x1 = dpt.matmul(mmT, m)
+    x2 = dpt.matmul(mT, m)
+
+    tol = 0
+    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
+        tol = 32 * dpt.finfo(x2.dtype).eps
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+    m_np = m_np[:-1, :-1]
+    x_np = np.matmul(m_np.T[:-2, :], m_np)
+
+    m = m[:-1, :-1]
+    mmT = m.mT[:-2, :]
+    mT = dpt.asarray(mmT, copy=True, order="C")
+    x1 = dpt.matmul(mmT, m)
+    x2 = dpt.matmul(mT, m)
+
+    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
+    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_outer(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((3, 8), dtype=dtype)
+    t2 = dpt.ones((4, 12), dtype=dtype)
+
+    r = dpt.tensordot(t1, t2, axes=0)
+    assert r.shape == t1.shape + t2.shape
+    assert dpt.allclose(r, dpt.ones_like(r))
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_inner(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((3, 8), dtype=dtype)
+    t2 = dpt.ones((4, 8), dtype=dtype)
+
+    r = dpt.tensordot(t1, t2.mT, axes=1)
+    assert r.shape == t1.shape[:1] + t2.shape[:1]
+    assert dpt.allclose(r, dpt.full_like(r, fill_value=t1.shape[1]))
+
+
+@pytest.mark.parametrize("dtype", _numeric_types)
+def test_tensordot_double(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    t1 = dpt.ones((2, 4, 8), dtype=dtype)
+    t2 = dpt.ones((3, 4, 8), dtype=dtype)
+
+    r = dpt.tensordot(t1, dpt.permute_dims(t2, (1, 2, 0)), axes=2)
+    assert r.shape == t1.shape[:1] + t2.shape[:1]
+    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
+    assert dpt.allclose(r, dpt.full_like(r, fill_value=expected))
+
+
+@pytest.mark.parametrize("dtype", ["i4", "f4"])
+def test_tensordot_axes_sequence(dtype):
+    get_queue_or_skip()
+
+    r = 4
+    t1 = dpt.ones((2, 2, 4, 3), dtype=dtype)
+    t2 = dpt.ones((3, 2, 4, 3), dtype=dtype)
+
+    assert len(t1.shape) == r
+    assert len(t2.shape) == r
+
+    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
+
+    for p1 in itertools.permutations(range(r)):
+        assert len(p1) == r
+        inv_p1 = sorted(range(r), key=p1.__getitem__)
+        u1 = dpt.permute_dims(t1, p1)
+        x1_axes = inv_p1[1:]
+        # a fresh permutations iterator is needed for every p1; a single
+        # shared itertools.permutations iterator would be exhausted after
+        # the first pass through the inner loop
+        for p2 in itertools.permutations(range(r)):
+            inv_p2 = sorted(range(r), key=p2.__getitem__)
+            u2 = dpt.permute_dims(t2, p2)
+            x2_axes = inv_p2[1:]
+
+            tdr = dpt.tensordot(u1, u2, axes=(x1_axes, x2_axes))
+            assert tdr.shape == t1.shape[:1] + t2.shape[:1]
+            assert dpt.allclose(tdr, dpt.full_like(tdr, fill_value=expected))
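+
+# The two accepted forms of the ``axes`` argument exercised above can be
+# summarized by a sketch.  The helper is illustrative only; the assumption
+# (NumPy-compatible semantics) is that an integer N means "contract the
+# last N axes of the first array against the first N axes of the second",
+# while a pair of sequences names the contracted axes explicitly.
+def _normalize_tensordot_axes_sketch(x1_ndim, axes):
+    if isinstance(axes, int):
+        return tuple(range(x1_ndim - axes, x1_ndim)), tuple(range(axes))
+    axes1, axes2 = axes
+    return tuple(axes1), tuple(axes2)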
dpt.tensordot(t1, t2, axes=invalid_axes) + + invalid_axes = ( + (1,), + ( + 0, + 2, + ), + ) + with pytest.raises(ValueError): + dpt.tensordot(t1, t2, axes=invalid_axes) + + with pytest.raises(ValueError): + dpt.tensordot(t1[..., :5], t2) + + +def test_tensordot_promotion(): + get_queue_or_skip() + + t1 = dpt.zeros((10, 10), dtype="i4") + t2 = dpt.zeros((10, 10), dtype="i8") + + r1 = dpt.tensordot(t1, t2) + assert r1.dtype == t2.dtype + + r2 = dpt.tensordot(t2, t1) + assert r2.dtype == t2.dtype + + t3 = dpt.zeros((10, 10), dtype="u4") + r3 = dpt.tensordot(t1, t3) + assert r3.dtype == dpt.result_type(t1, t3) + + +def test_tensordot_axes_errors(): + get_queue_or_skip() + + m1 = dpt.zeros((10, 10), dtype="i4") + m2 = dpt.zeros((10, 10), dtype="i4") + + with pytest.raises(ValueError): + dpt.tensordot(m1, m2, axes=-1) + + +# tests for gh-1570 +def test_tensordot_gemm_small_k_m(): + get_queue_or_skip() + + x1 = dpt.asarray(1, dtype="i2") + x2 = dpt.asarray([0, 1, 0, 0], dtype="i2") + + res = dpt.tensordot(x1, x2, axes=0) + assert dpt.all(x2 == res) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n = 511 + v1 = dpt.ones(n, dtype=dtype) + + v2 = dpt.ones(n, dtype=dtype) + + r = dpt.vecdot(v1, v2) + expected_value = _map_int_to_type(n, r.dtype) + assert r == expected_value + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_3d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m1, m2, n = 7, 3, 511 + v1 = dpt.ones((m1, m2, n), dtype=dtype) + + v2 = dpt.ones((m1, m2, n), dtype=dtype) + + r = dpt.vecdot(v1, v2) + + assert r.shape == ( + m1, + m2, + ) + expected_value = _map_int_to_type(n, r.dtype) + assert dpt.all(r == expected_value) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_axis(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m1, m2, n = 7, 3, 511 + v1 = dpt.ones((m1, n, m2), dtype=dtype) + + v2 = dpt.ones((m1, n, m2), dtype=dtype) + + r = dpt.vecdot(v1, v2, axis=-2) + + assert r.shape == ( + m1, + m2, + ) + expected_value = _map_int_to_type(n, r.dtype) + assert dpt.all(r == expected_value) + + +@pytest.mark.parametrize("dtype", _numeric_types) +def test_vecdot_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m1, m2, n = 7, 3, 511 + list1 = [1, 0, 2, 0] + pattern1 = dpt.asarray(list1, dtype=dtype) + n_padded1 = pattern1.size * (1 + ((n - 1) // pattern1.size)) + v1 = dpt.tile(dpt.reshape(pattern1, (1, -1, 1)), (m1, n_padded1, m2))[ + ::-1, :n, : + ] + + list2 = [1, 2, 1, 2] + pattern2 = dpt.asarray(list2, dtype=dtype) + n_padded2 = pattern2.size * (1 + ((n - 1) // pattern2.size)) + v2 = dpt.tile(dpt.reshape(pattern2, (1, -1, 1)), (m1, n_padded2, m2))[ + :, :n, ::-1 + ] + + r = dpt.vecdot(v1, v2, axis=-2) + + ref = sum( + el1 * el2 + for el1, el2 in zip((list1 * n_padded1)[:n], (list2 * n_padded2)[:n]) + ) + + assert r.shape == ( + m1, + m2, + ) + ref = _map_int_to_type(ref, r.dtype) + assert dpt.all(r == ref) + + +def test_vector_arg_validation(): + get_queue_or_skip() + + s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple()) + v1, v2 = dpt.ones(16), dpt.zeros(16) + + with pytest.raises(ValueError): + dpt.vecdot(s1, v2) + + with pytest.raises(ValueError): + dpt.vecdot(v1, s2) + + with pytest.raises(TypeError): + dpt.vecdot(dict(), v2) + + with pytest.raises(TypeError): + dpt.vecdot(v2, None) + + with pytest.raises(ValueError): + dpt.vecdot(v1[:5], v2[:4]) + + 
with pytest.raises(ValueError): + dpt.vecdot(v1, v2, axis=2) + + with pytest.raises(ValueError): + dpt.vecdot(v1, v2, axis=-2) + + q = dpctl.SyclQueue( + v2.sycl_context, v2.sycl_device, property="enable_profiling" + ) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.vecdot(v1, v2.to_device(q)) + + m1 = dpt.empty((10, 5)) + m2 = dpt.empty((5, 5)) + with pytest.raises(ValueError): + dpt.vecdot(m1, m2, axis=-1) + + +def test_vecdot_broadcast(): + get_queue_or_skip() + + for dt1, dt2 in [ + (dpt.int32, dpt.int32), + (dpt.int32, dpt.int64), + (dpt.int64, dpt.int32), + (dpt.int32, dpt.uint32), + ]: + m1 = dpt.zeros((1, 5), dtype=dt1) + m2 = dpt.zeros((5, 5), dtype=dt2) + r1 = dpt.vecdot(m1, m2, axis=-1) + r2 = dpt.vecdot(m2, m1, axis=-1) + assert r1.shape == r2.shape + + +@pytest.mark.parametrize("dt1", _numeric_types) +@pytest.mark.parametrize("dt2", _numeric_types) +def test_vecdot_type_promotion(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + v1 = dpt.ones(128, dtype=dt1) + v2 = dpt.ones(128, dtype=dt2) + + r = dpt.vecdot(v1, v2) + mul = v1 * v2 + assert r.shape == () + assert r.dtype == mul.dtype + assert dpt.allclose(r, dpt.sum(mul, dtype=mul.dtype)) + + +def test_vecdot_broadcast_o1_buffer(): + get_queue_or_skip() + + v1 = dpt.arange(10, dtype="i2") + v2 = dpt.ones((5, 10), dtype="i4") + + res1 = dpt.vecdot(v1, v2) + assert res1.shape == (5,) + + res2 = dpt.vecdot(v2, v1) + assert res2.shape == (5,) + + +def test_vecdot_contig_small(): + get_queue_or_skip() + + n = 1 + for dt in [dpt.int16, dpt.int32, dpt.complex64]: + v1 = dpt.zeros((10, n), dtype=dt) + v2 = dpt.ones_like(v1, dtype=dt) + v1[-1] = 1 + res = dpt.vecdot(v1, v2) + assert dpt.all(res[:-1] == 0) + assert res[-1] == n + + +def test_matmul_out_appended_axes(): + get_queue_or_skip() + + n0, n1, n2 = 4, 10, 5 + # vm + x1 = dpt.ones(n1, dtype="i4") + x2 = dpt.ones((n0, n1, n2), dtype="i4") + out = dpt.empty((n0, n2), dtype="i4") + + dpt.matmul(x1, x2, out=out) + assert dpt.all(out == n1) + + # mv + x2 = x2.mT + x1, x2 = x2, x1 + dpt.matmul(x1, x2, out=out) + assert dpt.all(out == n1) + + # vv + x1 = dpt.ones(n1, dtype="i4") + out = dpt.empty((), dtype="i4") + dpt.matmul(x1, x2, out=out) + assert out == n1 diff --git a/dpnp/tests/tensor/test_usm_ndarray_manipulation.py b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py new file mode 100644 index 000000000000..0375bb446370 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_manipulation.py @@ -0,0 +1,1608 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import dpctl +import numpy as np +import pytest +from numpy.testing import assert_, assert_array_equal, assert_raises_regex + +import dpnp.tensor as dpt +from dpnp.tensor._numpy_helper import AxisError + +from .helper import get_queue_or_skip + + +def test_permute_dims_incorrect_type(): + X_list = list([[1, 2, 3], [4, 5, 6]]) + X_tuple = tuple(X_list) + Xnp = np.array(X_list) + + pytest.raises(TypeError, dpt.permute_dims, X_list, (1, 0)) + pytest.raises(TypeError, dpt.permute_dims, X_tuple, (1, 0)) + pytest.raises(TypeError, dpt.permute_dims, Xnp, (1, 0)) + + +def test_permute_dims_empty_array(): + q = get_queue_or_skip() + + Xnp = np.empty((10, 0)) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.permute_dims(X, (1, 0)) + Ynp = np.transpose(Xnp, (1, 0)) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_permute_dims_0d_1d(): + q = get_queue_or_skip() + + Xnp_0d = np.array(1, dtype="int64") + X_0d = dpt.asarray(Xnp_0d, sycl_queue=q) + Y_0d = dpt.permute_dims(X_0d, ()) + assert_array_equal(dpt.asnumpy(Y_0d), dpt.asnumpy(X_0d)) + + Xnp_1d = np.random.randint(0, 2, size=6, dtype="int64") + X_1d = dpt.asarray(Xnp_1d, sycl_queue=q) + Y_1d = dpt.permute_dims(X_1d, (0)) + assert_array_equal(dpt.asnumpy(Y_1d), dpt.asnumpy(X_1d)) + + pytest.raises(ValueError, dpt.permute_dims, X_1d, ()) + pytest.raises(AxisError, dpt.permute_dims, X_1d, (1)) + pytest.raises(ValueError, dpt.permute_dims, X_1d, (1, 0)) + pytest.raises( + ValueError, dpt.permute_dims, dpt.reshape(X_1d, (2, 3)), (1, 1) + ) + + +@pytest.mark.parametrize("shapes", [(2, 2), (1, 4), (3, 3, 3), (4, 1, 3)]) +def test_permute_dims_2d_3d(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + + Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + X_ndim = X.ndim + if X_ndim == 2: + Y = dpt.permute_dims(X, (1, 0)) + Ynp = np.transpose(Xnp, (1, 0)) + elif X_ndim == 3: + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.permute_dims(X, (2, 0, 1)) + Ynp = np.transpose(Xnp, (2, 0, 1)) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_expand_dims_incorrect_type(): + X_list = [1, 2, 3, 4, 5] + with pytest.raises(TypeError): + dpt.expand_dims(X_list, axis=1) + + +def test_expand_dims_0d(): + q = get_queue_or_skip() + + Xnp = np.array(1, dtype="int64") + X = dpt.asarray(Xnp, sycl_queue=q) + + Y = dpt.expand_dims(X, axis=0) + Ynp = np.expand_dims(Xnp, axis=0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.expand_dims(X, axis=-1) + Ynp = np.expand_dims(Xnp, axis=-1) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.expand_dims, X, axis=1) + pytest.raises(AxisError, 
dpt.expand_dims, X, axis=-2) + + +@pytest.mark.parametrize("shapes", [(3,), (3, 3), (3, 3, 3)]) +def test_expand_dims_1d_3d(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + + Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + shape_len = len(shapes) + for axis in range(-shape_len - 1, shape_len): + Y = dpt.expand_dims(X, axis=axis) + Ynp = np.expand_dims(Xnp, axis=axis) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.expand_dims, X, axis=shape_len + 1) + pytest.raises(AxisError, dpt.expand_dims, X, axis=-shape_len - 2) + + +@pytest.mark.parametrize( + "axes", [(0, 1, 2), (0, -1, -2), (0, 3, 5), (0, -3, -5)] +) +def test_expand_dims_tuple(axes): + q = get_queue_or_skip() + + Xnp = np.empty((3, 3, 3), dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.expand_dims(X, axis=axes) + Ynp = np.expand_dims(Xnp, axis=axes) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_expand_dims_incorrect_tuple(): + try: + X = dpt.empty((3, 3, 3), dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(AxisError): + dpt.expand_dims(X, axis=(0, -6)) + with pytest.raises(AxisError): + dpt.expand_dims(X, axis=(0, 5)) + + with pytest.raises(ValueError): + dpt.expand_dims(X, axis=(1, 1)) + + +def test_squeeze_incorrect_type(): + X_list = [1, 2, 3, 4, 5] + with pytest.raises(TypeError): + dpt.squeeze(X_list, 1) + + +def test_squeeze_0d(): + q = get_queue_or_skip() + + Xnp = np.array(1) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.squeeze(X) + Ynp = Xnp.squeeze() + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.squeeze(X, 0) + Ynp = Xnp.squeeze(0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.squeeze(X, (0)) + Ynp = Xnp.squeeze(0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.squeeze(X, -1) + Ynp = Xnp.squeeze(-1) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.squeeze, X, 1) + pytest.raises(AxisError, dpt.squeeze, X, -2) + pytest.raises(AxisError, dpt.squeeze, X, (1)) + pytest.raises(AxisError, dpt.squeeze, X, (-2)) + pytest.raises(ValueError, dpt.squeeze, X, (0, 0)) + + +@pytest.mark.parametrize( + "shapes", + [ + (0), + (1), + (1, 2), + (2, 1), + (1, 1), + (2, 2), + (1, 0), + (0, 1), + (1, 2, 1), + (2, 1, 2), + (2, 2, 2), + (1, 1, 1), + (1, 0, 1), + (0, 1, 0), + ], +) +def test_squeeze_without_axes(shapes): + q = get_queue_or_skip() + + Xnp = np.empty(shapes, dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.squeeze(X) + Ynp = Xnp.squeeze() + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("axes", [0, 2, (0), (2), (0, 2)]) +def test_squeeze_axes_arg(axes): + q = get_queue_or_skip() + + Xnp = np.array([[[1], [2], [3]]], dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.squeeze(X, axes) + Ynp = Xnp.squeeze(axes) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize("axes", [1, -2, (1), (-2), (0, 0), (1, 1)]) +def test_squeeze_axes_arg_error(axes): + q = get_queue_or_skip() + + Xnp = np.array([[[1], [2], [3]]], dtype="u1") + X = dpt.asarray(Xnp, sycl_queue=q) + pytest.raises(ValueError, dpt.squeeze, X, axes) + + +@pytest.mark.parametrize( + "data", + [ + [np.array(0, dtype="u1"), (0,)], + [np.array(0, dtype="u1"), (1,)], + [np.array(0, dtype="u1"), (3,)], + [np.ones(1, dtype="u1"), (1,)], + [np.ones(1, dtype="u1"), (2,)], + [np.ones(1, dtype="u1"), (1, 2, 3)], + [np.arange(3, dtype="u1"), (3,)], + [np.arange(3, dtype="u1"), 
(1, 3)], + [np.arange(3, dtype="u1"), (2, 3)], + [np.ones(0, dtype="u1"), 0], + [np.ones(1, dtype="u1"), 1], + [np.ones(1, dtype="u1"), 2], + [np.ones(1, dtype="u1"), (0,)], + [np.ones((1, 2), dtype="u1"), (0, 2)], + [np.ones((2, 1), dtype="u1"), (2, 0)], + ], +) +def test_broadcast_to_succeeds(data): + q = get_queue_or_skip() + + Xnp, target_shape = data + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.broadcast_to(X, target_shape) + Ynp = np.broadcast_to(Xnp, target_shape) + assert_array_equal(dpt.asnumpy(Y), Ynp) + + +@pytest.mark.parametrize( + "data", + [ + [(0,), ()], + [(1,), ()], + [(3,), ()], + [(3,), (1,)], + [(3,), (2,)], + [(3,), (4,)], + [(1, 2), (2, 1)], + [(1, 1), (1,)], + [(1,), -1], + [(1,), (-1,)], + [(1, 2), (-1, 2)], + ], +) +def test_broadcast_to_raises(data): + q = get_queue_or_skip() + + orig_shape, target_shape = data + Xnp = np.zeros(orig_shape, dtype="i1") + X = dpt.asarray(Xnp, sycl_queue=q) + pytest.raises(ValueError, dpt.broadcast_to, X, target_shape) + + +def assert_broadcast_correct(input_shapes): + q = get_queue_or_skip() + np_arrays = [np.zeros(s, dtype="i1") for s in input_shapes] + out_np_arrays = np.broadcast_arrays(*np_arrays) + usm_arrays = [dpt.asarray(Xnp, sycl_queue=q) for Xnp in np_arrays] + out_usm_arrays = dpt.broadcast_arrays(*usm_arrays) + for Xnp, X in zip(out_np_arrays, out_usm_arrays): + assert_array_equal( + Xnp, dpt.asnumpy(X), err_msg=f"Failed for {input_shapes})" + ) + + +def assert_broadcast_arrays_raise(input_shapes): + q = get_queue_or_skip() + usm_arrays = [dpt.asarray(np.zeros(s), sycl_queue=q) for s in input_shapes] + pytest.raises(ValueError, dpt.broadcast_arrays, *usm_arrays) + + +def test_broadcast_arrays_same(): + q = get_queue_or_skip() + Xnp = np.arange(10) + Ynp = np.arange(10) + res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.asarray(Ynp, sycl_queue=q) + res_X, res_Y = dpt.broadcast_arrays(X, Y) + assert_array_equal(res_Xnp, dpt.asnumpy(res_X)) + assert_array_equal(res_Ynp, dpt.asnumpy(res_Y)) + + +def test_broadcast_arrays_one_off(): + q = get_queue_or_skip() + Xnp = np.array([[1, 2, 3]]) + Ynp = np.array([[1], [2], [3]]) + res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.asarray(Ynp, sycl_queue=q) + res_X, res_Y = dpt.broadcast_arrays(X, Y) + assert_array_equal(res_Xnp, dpt.asnumpy(res_X)) + assert_array_equal(res_Ynp, dpt.asnumpy(res_Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (), + (1,), + (3,), + (0, 1), + (0, 3), + (1, 0), + (3, 0), + (1, 3), + (3, 1), + (3, 3), + ], +) +def test_broadcast_arrays_same_shapes(shapes): + for shape in shapes: + single_input_shapes = [shape] + assert_broadcast_correct(single_input_shapes) + double_input_shapes = [shape, shape] + assert_broadcast_correct(double_input_shapes) + triple_input_shapes = [shape, shape, shape] + assert_broadcast_correct(triple_input_shapes) + + +@pytest.mark.parametrize( + "shapes", + [ + [[(1,), (3,)]], + [[(1, 3), (3, 3)]], + [[(3, 1), (3, 3)]], + [[(1, 3), (3, 1)]], + [[(1, 1), (3, 3)]], + [[(1, 1), (1, 3)]], + [[(1, 1), (3, 1)]], + [[(1, 0), (0, 0)]], + [[(0, 1), (0, 0)]], + [[(1, 0), (0, 1)]], + [[(1, 1), (0, 0)]], + [[(1, 1), (1, 0)]], + [[(1, 1), (0, 1)]], + ], +) +def test_broadcast_arrays_same_len_shapes(shapes): + # Check that two different input shapes of the same length, but some have + # ones, broadcast to the correct shape. 
+ + for input_shapes in shapes: + assert_broadcast_correct(input_shapes) + assert_broadcast_correct(input_shapes[::-1]) + + +@pytest.mark.parametrize( + "shapes", + [ + [[(), (3,)]], + [[(3,), (3, 3)]], + [[(3,), (3, 1)]], + [[(1,), (3, 3)]], + [[(), (3, 3)]], + [[(1, 1), (3,)]], + [[(1,), (3, 1)]], + [[(1,), (1, 3)]], + [[(), (1, 3)]], + [[(), (3, 1)]], + [[(), (0,)]], + [[(0,), (0, 0)]], + [[(0,), (0, 1)]], + [[(1,), (0, 0)]], + [[(), (0, 0)]], + [[(1, 1), (0,)]], + [[(1,), (0, 1)]], + [[(1,), (1, 0)]], + [[(), (1, 0)]], + [[(), (0, 1)]], + ], +) +def test_broadcast_arrays_different_len_shapes(shapes): + # Check that two different input shapes (of different lengths) broadcast + # to the correct shape. + + for input_shapes in shapes: + assert_broadcast_correct(input_shapes) + assert_broadcast_correct(input_shapes[::-1]) + + +@pytest.mark.parametrize( + "shapes", + [ + [[(3,), (4,)]], + [[(2, 3), (2,)]], + [[(3,), (3,), (4,)]], + [[(1, 3, 4), (2, 3, 3)]], + ], +) +def test_incompatible_shapes_raise_valueerror(shapes): + for input_shapes in shapes: + assert_broadcast_arrays_raise(input_shapes) + assert_broadcast_arrays_raise(input_shapes[::-1]) + + +def test_broadcast_arrays_no_args(): + with pytest.raises(ValueError): + dpt.broadcast_arrays() + + +def test_flip_axis_incorrect(): + q = get_queue_or_skip() + + X_np = np.ones((4, 4)) + X = dpt.asarray(X_np, sycl_queue=q) + + pytest.raises(AxisError, dpt.flip, dpt.asarray(np.ones(4)), axis=1) + pytest.raises(AxisError, dpt.flip, X, axis=2) + pytest.raises(AxisError, dpt.flip, X, axis=-3) + pytest.raises(AxisError, dpt.flip, X, axis=(0, 3)) + + +def test_flip_0d(): + q = get_queue_or_skip() + + Xnp = np.array(1, dtype="int64") + X = dpt.asarray(Xnp, sycl_queue=q) + Ynp = np.flip(Xnp) + Y = dpt.flip(X) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + pytest.raises(AxisError, dpt.flip, X, axis=0) + pytest.raises(AxisError, dpt.flip, X, axis=1) + pytest.raises(AxisError, dpt.flip, X, axis=-1) + + +def test_flip_1d(): + q = get_queue_or_skip() + + Xnp = np.arange(6) + X = dpt.asarray(Xnp, sycl_queue=q) + + for ax in range(-X.ndim, X.ndim): + Ynp = np.flip(Xnp, axis=ax) + Y = dpt.flip(X, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Ynp = np.flip(Xnp, axis=0) + Y = dpt.flip(X, axis=0) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (3, 2), + (2, 3), + (2, 2), + (3, 3), + (3, 2, 3), + (2, 3, 2), + (2, 2, 2), + (3, 3, 3), + ], +) +def test_flip_2d_3d(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + Xnp = np.arange(Xnp_size).reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + for ax in range(-X.ndim, X.ndim): + Y = dpt.flip(X, axis=ax) + Ynp = np.flip(Xnp, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (1,), + (3,), + (2, 3), + (3, 2), + (2, 2), + (1, 2, 3), + (2, 1, 3), + (2, 3, 1), + (3, 2, 1), + (3, 3, 3), + ], +) +def test_flip_default_axes(shapes): + q = get_queue_or_skip() + + Xnp_size = np.prod(shapes) + Xnp = np.arange(Xnp_size).reshape(shapes) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.flip(X) + Ynp = np.flip(Xnp) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "shapes", + [ + (0), + (1), + (1, 1), + (1, 0), + (0, 1), + (1, 1, 1), + (1, 0, 1), + (0, 1, 0), + ], +) +def test_flip_empty_0_size_dim(shapes): + q = get_queue_or_skip() + + X = dpt.empty(shapes, sycl_queue=q) + dpt.flip(X) + + +@pytest.mark.parametrize( + "data", + [ + [(2, 3), (0, 1)], + [(2, 3), (1, 0)], + [(2, 3), ()], + [(2, 1, 
3), (0, 2)], + [(3, 1, 2), (2, 0)], + [(3, 3, 3), (2,)], + [(1, 2, 3), [0, -2]], + [(3, 1, 2), [-1, 0]], + [(3, 3, 3), [-2, -1]], + ], +) +def test_flip_multiple_axes(data): + q = get_queue_or_skip() + + shape, axes = data + Xnp_size = np.prod(shape) + Xnp = np.arange(Xnp_size).reshape(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + Y = dpt.flip(X, axis=axes) + Ynp = np.flip(Xnp, axis=axes) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_roll_scalar(): + q = get_queue_or_skip() + + Xnp = np.ones([], dtype="f4") + X = dpt.asarray(Xnp, sycl_queue=q) + + Y = dpt.roll(X, 1) + Ynp = np.roll(Xnp, 1) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + with pytest.raises(AxisError): + dpt.roll(X, 1, axis=0) + with pytest.raises(AxisError): + dpt.roll(X, 1, axis=1) + + +@pytest.mark.parametrize( + "data", + [ + [2, None], + [-2, None], + [2, 0], + [-2, 0], + [2, ()], + [11, 0], + ], +) +def test_roll_1d(data): + q = get_queue_or_skip() + + Xnp = np.arange(10) + X = dpt.asarray(Xnp, sycl_queue=q) + sh, ax = data + + Y = dpt.roll(X, sh, axis=ax) + Ynp = np.roll(Xnp, sh, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Y = dpt.roll(X, sh, axis=ax) + Ynp = np.roll(Xnp, sh, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "data", + [ + [1, None], + [1, 0], + [1, 1], + [1, ()], + # Roll multiple axes at once + [1, (0, 1)], + [(1, 0), (0, 1)], + [(-1, 0), (1, 0)], + [(0, 1), (0, 1)], + [(0, -1), (0, 1)], + [(1, 1), (0, 1)], + [(-1, -1), (0, 1)], + # Roll the same axis multiple times. + [1, (0, 0)], + [1, (1, 1)], + # Roll more than one turn in either direction. + [6, 1], + [-4, 1], + ], +) +def test_roll_2d(data): + q = get_queue_or_skip() + + Xnp = np.arange(10).reshape(2, 5) + X = dpt.asarray(Xnp, sycl_queue=q) + sh, ax = data + + Y = dpt.roll(X, sh, axis=ax) + Ynp = np.roll(Xnp, sh, axis=ax) + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +def test_roll_out_bounds_shifts(): + "See gh-1857" + get_queue_or_skip() + + x = dpt.arange(4) + y = dpt.roll(x, np.uint64(2**63 + 2)) + expected = dpt.roll(x, 2) + assert dpt.all(y == expected) + + x_empty = x[1:1] + y = dpt.roll(x_empty, 11) + assert y.size == 0 + + x_2d = dpt.reshape(x, (2, 2)) + y = dpt.roll(x_2d, np.uint64(2**63 + 1), axis=1) + expected = dpt.roll(x_2d, 1, axis=1) + assert dpt.all(y == expected) + + x_2d_empty = x_2d[:, 1:1] + y = dpt.roll(x_2d_empty, 3, axis=1) + expected = dpt.empty_like(x_2d_empty) + assert dpt.all(y == expected) + + +def test_roll_validation(): + get_queue_or_skip() + + X = {} + with pytest.raises(TypeError): + dpt.roll(X) + + X = dpt.empty((1, 2, 3)) + shift = ((2, 3, 1), (1, 2, 3)) + with pytest.raises(ValueError): + dpt.roll(X, shift=shift, axis=(0, 1, 2)) + + +def test_concat_incorrect_type(): + Xnp = np.ones((2, 2)) + with pytest.raises(TypeError): + dpt.concat() + with pytest.raises(TypeError): + dpt.concat([]) + with pytest.raises(TypeError): + dpt.concat(Xnp) + with pytest.raises(TypeError): + dpt.concat([Xnp, Xnp]) + + +def test_concat_incorrect_queue(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + X = dpt.ones((2, 2), sycl_queue=q1) + Y = dpt.ones((2, 2), sycl_queue=q2) + + pytest.raises(ValueError, dpt.concat, [X, Y]) + + +def test_concat_different_dtype(): + q = get_queue_or_skip() + + X = dpt.ones((2, 2), dtype=np.int64, sycl_queue=q) + Y = dpt.ones((3, 2), dtype=np.uint32, sycl_queue=q) + + XY = dpt.concat([X, Y]) + + assert XY.dtype is X.dtype + assert XY.shape == (5, 2) + assert XY.sycl_queue == q + + X1 = dpt.arange(10, dtype="i2", sycl_queue=q) + Y1 = 
dpt.arange(5, dtype="i4", sycl_queue=q) + + XY1 = dpt.concat([X1[::2], Y1[::-1]], axis=None) + assert XY1.shape == (10,) + assert XY1.sycl_queue == q + assert XY1.dtype == Y1.dtype + + +def test_concat_incorrect_ndim(): + q = get_queue_or_skip() + + X = dpt.ones((2, 2), sycl_queue=q) + Y = dpt.ones((2, 2, 2), sycl_queue=q) + + pytest.raises(ValueError, dpt.concat, [X, Y]) + + +@pytest.mark.parametrize( + "data", + [ + [(2, 2), (3, 3), 0], + [(2, 2), (3, 3), 1], + [(3, 2), (3, 3), 0], + [(2, 3), (3, 3), 1], + ], +) +def test_concat_incorrect_shape(data): + q = get_queue_or_skip() + + Xshape, Yshape, axis = data + + X = dpt.ones(Xshape, sycl_queue=q) + Y = dpt.ones(Yshape, sycl_queue=q) + + pytest.raises(ValueError, dpt.concat, [X, Y], axis=axis) + + +@pytest.mark.parametrize( + "data", + [ + [(6,), 0], + [(2, 3), 1], + [(3, 2), -1], + [(1, 6), 0], + [(2, 1, 3), 2], + ], +) +def test_concat_1array(data): + q = get_queue_or_skip() + + Xshape, axis = data + + Xnp = np.arange(6).reshape(Xshape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.concatenate([Xnp], axis=axis) + Y = dpt.concat([X], axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Ynp = np.concatenate((Xnp,), axis=axis) + Y = dpt.concat((X,), axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), (1,), 0], + [(0, 2), (0, 2), 1], + [(0, 2), (2, 2), 0], + [(2, 1), (2, 2), -1], + [(2, 2, 2), (2, 1, 2), 1], + [(3, 3, 3), (2, 2), None], + ], +) +def test_concat_2arrays(data): + q = get_queue_or_skip() + + Xshape, Yshape, axis = data + + Xnp = np.ones(Xshape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(Yshape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.concatenate([Xnp, Ynp], axis=axis) + Z = dpt.concat([X, Y], axis=axis) + + assert_array_equal(Znp, dpt.asnumpy(Z)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), (1,), (1,), 0], + [(0, 2), (2, 2), (1, 2), 0], + [(2, 1, 2), (2, 2, 2), (2, 4, 2), 1], + ], +) +def test_concat_3arrays(data): + q = get_queue_or_skip() + + Xshape, Yshape, Zshape, axis = data + + Xnp = np.ones(Xshape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(Yshape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.full(Zshape, 2.0) + Z = dpt.asarray(Znp, sycl_queue=q) + + Rnp = np.concatenate([Xnp, Ynp, Znp], axis=axis) + R = dpt.concat([X, Y, Z], axis=axis) + + assert_array_equal(Rnp, dpt.asnumpy(R)) + + +def test_concat_axis_none_strides(): + q = get_queue_or_skip() + Xnp = np.arange(0, 18).reshape((6, 3)) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.arange(20, 36).reshape((4, 2, 2)) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.concatenate([Xnp[::2], Ynp[::2]], axis=None) + Z = dpt.concat([X[::2], Y[::2]], axis=None) + + assert_array_equal(Znp, dpt.asnumpy(Z)) + + +def test_stack_incorrect_shape(): + q = get_queue_or_skip() + + X = dpt.ones((1,), sycl_queue=q) + Y = dpt.ones((2,), sycl_queue=q) + + with pytest.raises(ValueError): + dpt.stack([X, Y], axis=0) + + +@pytest.mark.parametrize( + "data", + [ + [(6,), 0], + [(2, 3), 1], + [(3, 2), -1], + [(1, 6), 2], + [(2, 1, 3), 2], + ], +) +def test_stack_1array(data): + q = get_queue_or_skip() + + shape, axis = data + + Xnp = np.arange(6).reshape(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.stack([Xnp], axis=axis) + Y = dpt.stack([X], axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + Ynp = np.stack((Xnp,), axis=axis) + Y = dpt.stack((X,), axis=axis) + + assert_array_equal(Ynp, dpt.asnumpy(Y)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), 
0], + [(0, 2), 0], + [(2, 0), 0], + [(2, 3), 0], + [(2, 3), 1], + [(2, 3), 2], + [(2, 3), -1], + [(2, 3), -2], + [(2, 2, 2), 1], + ], +) +def test_stack_2arrays(data): + q = get_queue_or_skip() + + shape, axis = data + + Xnp = np.ones(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(shape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.stack([Xnp, Ynp], axis=axis) + Z = dpt.stack([X, Y], axis=axis) + + assert_array_equal(Znp, dpt.asnumpy(Z)) + + +@pytest.mark.parametrize( + "data", + [ + [(1,), 0], + [(0, 2), 0], + [(2, 1, 2), 1], + ], +) +def test_stack_3arrays(data): + q = get_queue_or_skip() + + shape, axis = data + + Xnp = np.ones(shape) + X = dpt.asarray(Xnp, sycl_queue=q) + + Ynp = np.zeros(shape) + Y = dpt.asarray(Ynp, sycl_queue=q) + + Znp = np.full(shape, 2.0) + Z = dpt.asarray(Znp, sycl_queue=q) + + Rnp = np.stack([Xnp, Ynp, Znp], axis=axis) + R = dpt.stack([X, Y, Z], axis=axis) + + assert_array_equal(Rnp, dpt.asnumpy(R)) + + +def test_can_cast(): + q = get_queue_or_skip() + + # incorrect input + X = dpt.ones((2, 2), dtype=dpt.int16, sycl_queue=q) + pytest.raises(TypeError, dpt.can_cast, X, 1) + pytest.raises(TypeError, dpt.can_cast, X, X) + X_np = np.ones((2, 2), dtype=np.int16) + + assert dpt.can_cast(X, "float32") == np.can_cast(X_np, "float32") + assert dpt.can_cast(X, dpt.int32) == np.can_cast(X_np, np.int32) + assert dpt.can_cast(X, dpt.int64) == np.can_cast(X_np, np.int64) + + +def test_result_type(): + q = get_queue_or_skip() + + usm_ar = dpt.ones((2), dtype=dpt.int16, sycl_queue=q) + np_ar = dpt.asnumpy(usm_ar) + + X = [usm_ar, dpt.int32, "int64", usm_ar] + X_np = [np_ar, np.int32, "int64", np_ar] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [usm_ar, dpt.int32, "int64", True] + X_np = [np_ar, np.int32, "int64", True] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [usm_ar, dpt.int32, "int64", 2] + X_np = [np_ar, np.int32, "int64", 2] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [dpt.int32, "int64", 2] + X_np = [np.int32, "int64", 2] + + assert dpt.result_type(*X) == np.result_type(*X_np) + + X = [usm_ar, dpt.int32, "int64", 2.0] + X_np = [np_ar, np.int32, "int64", 2.0] + + assert dpt.result_type(*X).kind == np.result_type(*X_np).kind + + X = [usm_ar, dpt.int32, "int64", 2.0 + 1j] + X_np = [np_ar, np.int32, "int64", 2.0 + 1j] + + assert dpt.result_type(*X).kind == np.result_type(*X_np).kind + + +def test_swapaxes_1d(): + get_queue_or_skip() + x = np.array([[1, 2, 3]]) + exp = np.swapaxes(x, 0, 1) + + y = dpt.asarray([[1, 2, 3]]) + res = dpt.swapaxes(y, 0, 1) + + assert_array_equal(exp, dpt.asnumpy(res)) + + +def test_swapaxes_2d(): + get_queue_or_skip() + x = np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) + exp = np.swapaxes(x, 0, 2) + + y = dpt.asarray([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) + res = dpt.swapaxes(y, 0, 2) + + assert_array_equal(exp, dpt.asnumpy(res)) + + +@pytest.mark.parametrize( + "source, expected", + [ + (0, (6, 7, 5)), + (1, (5, 7, 6)), + (2, (5, 6, 7)), + (-1, (5, 6, 7)), + ], +) +def test_moveaxis_move_to_end(source, expected): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(5 * 6 * 7), (5, 6, 7)) + actual = dpt.moveaxis(x, source, -1).shape + assert_(actual == expected) + + +@pytest.mark.parametrize( + "source, destination, expected", + [ + (0, 1, (2, 1, 3, 4)), + (1, 2, (1, 3, 2, 4)), + (1, -1, (1, 3, 4, 2)), + ], +) +def test_moveaxis_new_position(source, destination, expected): + get_queue_or_skip() + x = dpt.reshape(dpt.arange(24), (1, 2, 3, 4)) + actual = dpt.moveaxis(x, 
source, destination).shape + assert_(actual == expected) + + +@pytest.mark.parametrize( + "source, destination", + [ + (0, 0), + (3, -1), + (-1, 3), + ([0, -1], [0, -1]), + ([2, 0], [2, 0]), + ], +) +def test_moveaxis_preserve_order(source, destination): + get_queue_or_skip() + x = dpt.zeros((1, 2, 3, 4)) + actual = dpt.moveaxis(x, source, destination).shape + assert_(actual == (1, 2, 3, 4)) + + +@pytest.mark.parametrize( + "shape, source, destination, expected", + [ + ((0, 1, 2, 3), [0, 1], [2, 3], (2, 3, 0, 1)), + ((0, 1, 2, 3), [2, 3], [0, 1], (2, 3, 0, 1)), + ((0, 1, 2, 3), [0, 1, 2], [2, 3, 0], (2, 3, 0, 1)), + ((0, 1, 2, 3), [3, 0], [1, 0], (0, 3, 1, 2)), + ((0, 1, 2, 3), [0, 3], [0, 1], (0, 3, 1, 2)), + ((1, 2, 3, 4), range(4), range(4), (1, 2, 3, 4)), + ], +) +def test_moveaxis_move_multiples(shape, source, destination, expected): + get_queue_or_skip() + x = dpt.zeros(shape) + y = dpt.moveaxis(x, source, destination) + actual = y.shape + assert_(actual == expected) + assert y._pointer == x._pointer + + +def test_moveaxis_errors(): + try: + x_flat = dpt.arange(6) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + x = dpt.reshape(x_flat, (1, 2, 3)) + assert_raises_regex( + AxisError, "source.*out of bounds", dpt.moveaxis, x, 3, 0 + ) + assert_raises_regex( + AxisError, "source.*out of bounds", dpt.moveaxis, x, -4, 0 + ) + assert_raises_regex( + AxisError, "destination.*out of bounds", dpt.moveaxis, x, 0, 5 + ) + assert_raises_regex( + ValueError, "repeated axis in `source`", dpt.moveaxis, x, [0, 0], [0, 1] + ) + assert_raises_regex( + ValueError, + "repeated axis in `destination`", + dpt.moveaxis, + x, + [0, 1], + [1, 1], + ) + assert_raises_regex( + ValueError, "must have the same number", dpt.moveaxis, x, 0, [0, 1] + ) + assert_raises_regex( + ValueError, "must have the same number", dpt.moveaxis, x, [0, 1], [0] + ) + + +def test_unstack_axis0(): + try: + x_flat = dpt.arange(6) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + y = dpt.reshape(x_flat, (2, 3)) + res = dpt.unstack(y) + + assert_array_equal(dpt.asnumpy(y[0, ...]), dpt.asnumpy(res[0])) + assert_array_equal(dpt.asnumpy(y[1, ...]), dpt.asnumpy(res[1])) + + +def test_unstack_axis1(): + try: + x_flat = dpt.arange(6) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + y = dpt.reshape(x_flat, (2, 3)) + res = dpt.unstack(y, axis=1) + + assert_array_equal(dpt.asnumpy(y[:, 0, ...]), dpt.asnumpy(res[0])) + assert_array_equal(dpt.asnumpy(y[:, 1, ...]), dpt.asnumpy(res[1])) + assert_array_equal(dpt.asnumpy(y[:, 2, ...]), dpt.asnumpy(res[2])) + + +def test_unstack_axis2(): + try: + x_flat = dpt.arange(60) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + y = dpt.reshape(x_flat, (4, 5, 3)) + res = dpt.unstack(y, axis=2) + + assert_array_equal(dpt.asnumpy(y[:, :, 0, ...]), dpt.asnumpy(res[0])) + assert_array_equal(dpt.asnumpy(y[:, :, 1, ...]), dpt.asnumpy(res[1])) + assert_array_equal(dpt.asnumpy(y[:, :, 2, ...]), dpt.asnumpy(res[2])) + + +def test_finfo_object(): + fi = dpt.finfo(dpt.float32) + assert isinstance(fi.bits, int) + assert isinstance(fi.max, float) + assert isinstance(fi.min, float) + assert isinstance(fi.eps, float) + assert isinstance(fi.epsneg, float) + assert isinstance(fi.smallest_normal, float) + assert isinstance(fi.tiny, float) + assert isinstance(fi.precision, float) + assert isinstance(fi.resolution, float) + assert isinstance(fi.dtype, dpt.dtype) + assert isinstance(str(fi), str) + 
assert isinstance(repr(fi), str) + + +def test_repeat_scalar_sequence_agreement(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + expected_res = dpt.empty(10, dtype="i4") + expected_res[1::2], expected_res[::2] = x, x + + # scalar case + reps = 2 + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + # tuple + reps = (2, 2, 2, 2, 2) + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + +def test_repeat_as_broadcasting(): + get_queue_or_skip() + + reps = 5 + x = dpt.arange(reps, dtype="i4") + x1 = x[:, dpt.newaxis] + expected_res = dpt.broadcast_to(x1, (reps, reps)) + + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + + x2 = x[dpt.newaxis, :] + expected_res = dpt.broadcast_to(x2, (reps, reps)) + + res = dpt.repeat(x2, reps, axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_axes(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(5 * 10, dtype="i4"), (5, 10)) + expected_res = dpt.empty((x.shape[0] * 2, x.shape[1]), dtype=x.dtype) + expected_res[::2, :], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + + expected_res = dpt.empty((x.shape[0], x.shape[1] * 2), dtype=x.dtype) + expected_res[:, ::2], expected_res[:, 1::2] = x, x + res = dpt.repeat(x, reps, axis=1) + assert dpt.all(res == expected_res) + + x = dpt.arange(10, dtype="i4") + expected_res = dpt.empty(x.shape[0] * reps, dtype=x.dtype) + expected_res[::2], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_size_0_outputs(): + get_queue_or_skip() + + x = dpt.ones((3, 0, 5), dtype="i4") + reps = 10 + res = dpt.repeat(x, reps, axis=0) + assert res.size == 0 + assert res.shape == (30, 0, 5) + + res = dpt.repeat(x, reps, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + res = dpt.repeat(x, (2, 2, 2), axis=0) + assert res.size == 0 + assert res.shape == (6, 0, 5) + + x = dpt.ones((3, 2, 5)) + res = dpt.repeat(x, 0, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + res = dpt.repeat(x, (0, 0), axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + # axis=None cases + res = dpt.repeat(x, 0) + assert res.size == 0 + + res = dpt.repeat(x, (0,) * x.size) + assert res.size == 0 + + +def test_repeat_strides(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, ::2], expected_res[:, 1::2] = x1, x1 + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[1], axis=1) + assert dpt.all(res == expected_res) + + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[::2, :], expected_res[1::2, :] = x1, x1 + res = dpt.repeat(x1, reps, axis=0) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0) + assert dpt.all(res == expected_res) + + # axis=None + x = dpt.reshape(dpt.arange(10 * 10), (10, 10)) + x1 = dpt.reshape(x[::-2, :], -1) + x2 = x[::-2, :] + expected_res = dpt.empty(10 * 10, dtype="i4") + expected_res[::2], expected_res[1::2] = x1, x1 + res = dpt.repeat(x2, reps) + assert dpt.all(res == expected_res) + res = dpt.repeat(x2, (reps,) * x1.size) + assert dpt.all(res == expected_res) + + +def test_repeat_casting(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + # i4 is cast to i8 + reps = dpt.ones(5, dtype="i4") + res = 
dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_strided_repeats(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + reps = dpt.ones(10, dtype="i8") + reps[::2] = 0 + reps = reps[::-2] + res = dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_size1_repeats(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + expected_res = dpt.repeat(x, 2) + # 0D repeats + reps_0d = dpt.asarray(2, dtype="i8") + res = dpt.repeat(x, reps_0d) + assert dpt.all(res == expected_res) + # 1D repeats + reps_1d = dpt.asarray([2], dtype="i8") + res = dpt.repeat(x, reps_1d) + assert dpt.all(res == expected_res) + + +def test_repeat_arg_validation(): + get_queue_or_skip() + + x = {} + with pytest.raises(TypeError): + dpt.repeat(x, 2) + + # axis must be 0 for scalar + x = dpt.empty(()) + with pytest.raises(ValueError): + dpt.repeat(x, 2, axis=1) + + # repeats must be positive + x = dpt.empty(5) + with pytest.raises(ValueError): + dpt.repeat(x, -2) + + # repeats must be integers + with pytest.raises(TypeError): + dpt.repeat(x, 2.0) + + # repeats tuple must be the same length as axis + with pytest.raises(ValueError): + dpt.repeat(x, (1, 2)) + + # repeats tuple elements must be positive + with pytest.raises(ValueError): + dpt.repeat(x, (-1,)) + + # repeats must be int or tuple + with pytest.raises(TypeError): + dpt.repeat(x, dict()) + + # repeats array must be 0d or 1d + with pytest.raises(ValueError): + dpt.repeat(x, dpt.ones((1, 1), dtype="i8")) + + # repeats must be castable to i8 + with pytest.raises(TypeError): + dpt.repeat(x, dpt.asarray(2.0, dtype="f4")) + + # compute follows data + q2 = dpctl.SyclQueue() + reps = dpt.asarray(1, dtype="i8", sycl_queue=q2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.repeat(x, reps) + + # repeats array must not contain negative elements + reps = dpt.asarray(-1, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + reps = dpt.asarray([1, 1, 1, 1, -1], dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + # repeats must broadcastable to axis size + reps = dpt.arange(10, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + +def test_tile_basic(): + get_queue_or_skip() + + reps = 2 + x = dpt.arange(5, dtype="i4") + res = dpt.tile(x, reps) + assert res.shape == (x.shape[0] * reps,) + assert dpt.all(res[: x.size] == res[x.size :]) + + reps = (2, 1) + expected_sh = (2, x.shape[0]) + expected_res = dpt.broadcast_to(x, expected_sh) + res = dpt.tile(x, reps) + assert res.shape == expected_sh + assert dpt.all(expected_res == res) + + +def test_tile_size_1(): + get_queue_or_skip() + + reps = 5 + # test for 0d array + x1 = dpt.asarray(2, dtype="i4") + res = dpt.tile(x1, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + # test for 1d array with single element + x2 = dpt.asarray([2], dtype="i4") + res = dpt.tile(x2, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + reps = () + # test for gh-1627 behavior + res = dpt.tile(x1, reps) + assert x1.shape == res.shape + assert_array_equal(dpt.asnumpy(x1), dpt.asnumpy(res)) + + res = dpt.tile(x2, reps) + assert x2.shape == res.shape + assert_array_equal(dpt.asnumpy(x2), dpt.asnumpy(res)) + + +def test_tile_prepends_axes(): + get_queue_or_skip() + + reps = (2,) + x = dpt.ones((5, 10), dtype="i4") + expected_res = dpt.ones((5, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + reps = (3, 2, 2) + expected_res = 
dpt.ones((3, 10, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + +def test_tile_empty_outputs(): + get_queue_or_skip() + + x = dpt.asarray((), dtype="i4") + reps = 10 + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0,) + + x = dpt.ones((3, 0, 5), dtype="i4") + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (3, 0, 50) + + reps = (2, 1, 2) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (6, 0, 10) + + x = dpt.ones((2, 3, 4), dtype="i4") + reps = (0, 1, 1) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0, 3, 4) + + +def test_tile_strides(): + get_queue_or_skip() + + reps = (1, 2) + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, : x1.shape[1]], expected_res[:, x1.shape[1] :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + reps = (2, 1) + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[: x1.shape[0], :], expected_res[x1.shape[0] :, :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + +def test_tile_size_1_axes(): + get_queue_or_skip() + + reps = (1, 2, 1) + x = dpt.ones((2, 1, 3), dtype="i4") + res = dpt.tile(x, reps) + expected_res = dpt.broadcast_to(x, (2, 2, 3)) + assert dpt.all(res == expected_res) + + +def test_tile_arg_validation(): + get_queue_or_skip() + + with pytest.raises(TypeError): + dpt.tile(dict(), 2) + + # repetitions must be int or tuple + x = dpt.empty(()) + with pytest.raises(TypeError): + dpt.tile(x, dict()) + + +def test_repeat_0_size(): + get_queue_or_skip() + + x = dpt.ones((0, 10, 0), dtype="i4") + repetitions = 2 + res = dpt.repeat(x, repetitions) + assert res.shape == (0,) + res = dpt.repeat(x, repetitions, axis=2) + assert res.shape == x.shape + res = dpt.repeat(x, repetitions, axis=1) + axis_sz = x.shape[1] * repetitions + assert res.shape == (0, 20, 0) + + repetitions = dpt.asarray(2, dtype="i4") + res = dpt.repeat(x, repetitions) + assert res.shape == (0,) + res = dpt.repeat(x, repetitions, axis=2) + assert res.shape == x.shape + res = dpt.repeat(x, repetitions, axis=1) + assert res.shape == (0, 20, 0) + + repetitions = dpt.arange(10, dtype="i4") + res = dpt.repeat(x, repetitions, axis=1) + axis_sz = dpt.sum(repetitions) + assert res.shape == (0, axis_sz, 0) + + repetitions = (2,) * 10 + res = dpt.repeat(x, repetitions, axis=1) + axis_sz = 2 * x.shape[1] + assert res.shape == (0, axis_sz, 0) + + +def test_result_type_bug_1874(): + py_sc = True + np_sc = np.asarray([py_sc])[0] + dts_bool = [py_sc, np_sc] + py_sc = int(1) + np_sc = np.asarray([py_sc])[0] + dts_ints = [py_sc, np_sc] + dts_floats = [float(1), np.float64(1)] + dts_complexes = [complex(1), np.complex128(1)] + + # iterate over two categories + for dts1, dts2 in itertools.product( + [dts_bool, dts_ints, dts_floats, dts_complexes], repeat=2 + ): + res_dts = [] + # iterate over Python scalar/NumPy scalar choices within categories + for dt1, dt2 in itertools.product(dts1, dts2): + res_dt = dpt.result_type(dt1, dt2) + res_dts.append(res_dt) + # check that all results are the same + assert res_dts and all(res_dts[0] == el for el in res_dts[1:]) diff --git a/dpnp/tests/tensor/test_usm_ndarray_operators.py b/dpnp/tests/tensor/test_usm_ndarray_operators.py new file mode 100644 index 000000000000..8ac178def197 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_operators.py @@ -0,0 +1,154 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import pytest + +import dpnp.tensor as dpt + + +class Dummy: + @staticmethod + def abs(a): + return a + + @staticmethod + def add(a, b): + if isinstance(a, dpt.usm_ndarray): + return a + else: + return b + + @staticmethod + def subtract(a, b): + if isinstance(a, dpt.usm_ndarray): + return a + else: + return b + + @staticmethod + def multiply(a, b): + if isinstance(a, dpt.usm_ndarray): + return a + else: + return b + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) +def test_fp_ops(namespace): + try: + X = dpt.ones(1) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X._set_namespace(namespace) + assert X.__array_namespace__() is namespace + X[0] = -2.5 + X.__abs__() + X.__add__(1.0) + X.__radd__(1.0) + X.__sub__(1.0) + X.__rsub__(1.0) + X.__mul__(1.0) + X.__rmul__(1.0) + X.__truediv__(1.0) + X.__rtruediv__(1.0) + X.__floordiv__(1.0) + X.__rfloordiv__(1.0) + X.__pos__() + X.__neg__() + X.__eq__(-2.5) + X.__ne__(-2.5) + X.__le__(-2.5) + X.__ge__(-2.5) + X.__gt__(-2.0) + X.__iadd__(X) + X.__isub__(X) + X.__imul__(X) + X.__itruediv__(1.0) + X.__ifloordiv__(1.0) + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) +def test_int_ops(namespace): + try: + X = dpt.usm_ndarray(1, "i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X._set_namespace(namespace) + assert X.__array_namespace__() is namespace + X.__lshift__(2) + X.__rshift__(2) + X.__rlshift__(2) + X.__rrshift__(2) + X.__ilshift__(2) + X.__irshift__(2) + X.__and__(X) + X.__rand__(X) + X.__iand__(X) + X.__or__(X) + X.__ror__(X) + X.__ior__(X) + X.__xor__(X) + X.__rxor__(X) + X.__ixor__(X) + X.__invert__() + X.__mod__(5) + X.__rmod__(5) + X.__imod__(5) + X.__pow__(2) + X.__rpow__(2) + X.__ipow__(2) + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) 
+def test_mat_ops(namespace): + try: + M = dpt.eye(3, 3) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + M._set_namespace(namespace) + assert M.__array_namespace__() is namespace + M.__matmul__(M) + M.__imatmul__(M) + M.__rmatmul__(M) + + +@pytest.mark.parametrize("namespace", [dpt, Dummy()]) +def test_comp_ops(namespace): + try: + X = dpt.asarray(1, dtype="u8") + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + X._set_namespace(namespace) + assert X.__array_namespace__() is namespace + assert X.__gt__(-1) + assert X.__ge__(-1) + assert not X.__lt__(-1) + assert not X.__le__(-1) + assert not X.__eq__(-1) + assert X.__ne__(-1) diff --git a/dpnp/tests/tensor/test_usm_ndarray_print.py b/dpnp/tests/tensor/test_usm_ndarray_print.py new file mode 100644 index 000000000000..94dbfca7c198 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_print.py @@ -0,0 +1,408 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +class TestPrint: + def setup_method(self): + self._retain_options = dpt.get_print_options() + + def teardown_method(self): + dpt.set_print_options(**self._retain_options) + + +class TestArgValidation(TestPrint): + @pytest.mark.parametrize( + "arg,err", + [ + ({"linewidth": "I"}, TypeError), + ({"edgeitems": "I"}, TypeError), + ({"threshold": "I"}, TypeError), + ({"precision": "I"}, TypeError), + ({"floatmode": "I"}, ValueError), + ({"edgeitems": "I"}, TypeError), + ({"sign": "I"}, ValueError), + ({"nanstr": np.nan}, TypeError), + ({"infstr": np.nan}, TypeError), + ], + ) + def test_print_option_arg_validation(self, arg, err): + with pytest.raises(err): + dpt.set_print_options(**arg) + + def test_usm_ndarray_repr_arg_validation(self): + X = {} + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X) + + try: + X = dpt.arange(4) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X, line_width="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X, precision="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_repr(X, prefix=4) + + def test_usm_ndarray_str_arg_validation(self): + X = {} + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X) + + try: + X = dpt.arange(4) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, line_width="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, edge_items="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, threshold="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, precision="I") + + with pytest.raises(ValueError): + dpt.usm_ndarray_str(X, floatmode="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, edge_items="I") + + with pytest.raises(ValueError): + dpt.usm_ndarray_str(X, sign="I") + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, prefix=4) + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, prefix=4) + + with pytest.raises(TypeError): + dpt.usm_ndarray_str(X, suffix=4) + + +class TestSetPrintOptions(TestPrint): + def test_set_linewidth(self): + q = get_queue_or_skip() + + dpt.set_print_options(linewidth=1) + x = dpt.asarray([0, 1], sycl_queue=q) + assert str(x) == "[0\n 1]" + + def test_set_precision(self): + q = get_queue_or_skip() + + dpt.set_print_options(precision=4) + x = dpt.asarray([1.23450], sycl_queue=q) + assert str(x) == "[1.2345]" + + def test_threshold_edgeitems(self): + q = get_queue_or_skip() + + dpt.set_print_options(threshold=1, edgeitems=1) + x = dpt.arange(9, sycl_queue=q) + assert str(x) == "[0 ... 
8]" + dpt.set_print_options(edgeitems=9) + assert str(x) == "[0 1 2 3 4 5 6 7 8]" + + def test_floatmodes(self): + q = get_queue_or_skip() + + x = dpt.asarray([0.1234, 0.1234678], sycl_queue=q) + dpt.set_print_options(floatmode="fixed", precision=4) + assert str(x) == "[0.1234 0.1235]" + + dpt.set_print_options(floatmode="unique") + assert str(x) == "[0.1234 0.1234678]" + + dpt.set_print_options(floatmode="maxprec") + assert str(x) == "[0.1234 0.1235]" + + dpt.set_print_options(floatmode="maxprec", precision=8) + assert str(x) == "[0.1234 0.1234678]" + + dpt.set_print_options(floatmode="maxprec_equal", precision=4) + assert str(x) == "[0.1234 0.1235]" + + dpt.set_print_options(floatmode="maxprec_equal", precision=8) + assert str(x) == "[0.1234000 0.1234678]" + + def test_nan_inf_suppress(self): + q = get_queue_or_skip() + + dpt.set_print_options(nanstr="nan1", infstr="inf1") + x = dpt.asarray([np.nan, np.inf], sycl_queue=q) + assert str(x) == "[nan1 inf1]" + + def test_suppress_small(self): + q = get_queue_or_skip() + + dpt.set_print_options(suppress=True) + x = dpt.asarray(5e-10, sycl_queue=q) + assert str(x) == "0." + + def test_sign(self): + q = get_queue_or_skip() + + x = dpt.asarray([0.0, 1.0, 2.0], sycl_queue=q) + y = dpt.asarray(1.0, sycl_queue=q) + z = dpt.asarray([1.0 + 1.0j], sycl_queue=q) + assert str(x) == "[0. 1. 2.]" + assert str(y) == "1." + assert str(z) == "[1.+1.j]" + + dpt.set_print_options(sign="+") + assert str(x) == "[+0. +1. +2.]" + assert str(y) == "+1." + assert str(z) == "[+1.+1.j]" + + dpt.set_print_options(sign=" ") + assert str(x) == "[ 0. 1. 2.]" + assert str(y) == " 1." + assert str(z) == "[ 1.+1.j]" + + def test_numpy(self): + dpt.set_print_options(numpy=True) + options = dpt.get_print_options() + np_options = np.get_printoptions() + assert all(np_options[k] == options[k] for k in options.keys()) + + +class TestPrintFns(TestPrint): + @pytest.mark.parametrize( + "dtype,x_str", + [ + ("b1", "[False True True True]"), + ("i1", "[0 1 2 3]"), + ("u1", "[0 1 2 3]"), + ("i2", "[0 1 2 3]"), + ("u2", "[0 1 2 3]"), + ("i4", "[0 1 2 3]"), + ("u4", "[0 1 2 3]"), + ("i8", "[0 1 2 3]"), + ("u8", "[0 1 2 3]"), + ("f2", "[0. 1. 2. 3.]"), + ("f4", "[0. 1. 2. 3.]"), + ("f8", "[0. 1. 2. 3.]"), + ("c8", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"), + ("c16", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"), + ], + ) + def test_print_types(self, dtype, x_str): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray([0, 1, 2, 3], dtype=dtype, sycl_queue=q) + assert str(x) == x_str + + def test_print_str(self): + q = get_queue_or_skip() + + x = dpt.asarray(0, sycl_queue=q) + assert str(x) == "0" + + x = dpt.asarray([np.nan, np.inf], sycl_queue=q) + assert str(x) == "[nan inf]" + + x = dpt.arange(9, sycl_queue=q) + assert str(x) == "[0 1 2 3 4 5 6 7 8]" + + y = dpt.reshape(x, (3, 3), copy=True) + assert str(y) == "[[0 1 2]\n [3 4 5]\n [6 7 8]]" + + def test_print_str_abbreviated(self): + q = get_queue_or_skip() + + dpt.set_print_options(threshold=0, edgeitems=1) + x = dpt.arange(9, sycl_queue=q) + assert str(x) == "[0 ... 8]" + + x = dpt.reshape(x, (3, 3)) + assert str(x) == "[[0 ... 2]\n ...\n [6 ... 
8]]" + + def test_usm_ndarray_str_separator(self): + q = get_queue_or_skip() + + x = dpt.reshape(dpt.arange(4, sycl_queue=q), (2, 2)) + + np.testing.assert_equal( + dpt.usm_ndarray_str(x, prefix="test", separator=" "), + "[[0 1]\n [2 3]]", + ) + + def test_print_repr(self): + q = get_queue_or_skip() + + x = dpt.asarray(3, dtype="int64", sycl_queue=q) + assert repr(x) == "usm_ndarray(3)" + + x = dpt.asarray([np.nan, np.inf], sycl_queue=q) + if x.sycl_device.has_aspect_fp64: + assert repr(x) == "usm_ndarray([nan, inf])" + else: + assert repr(x) == "usm_ndarray([nan, inf], dtype=float32)" + + x = dpt.arange(9, sycl_queue=q, dtype="int64") + assert repr(x) == "usm_ndarray([0, 1, 2, 3, 4, 5, 6, 7, 8])" + + x = dpt.reshape(x, (3, 3)) + np.testing.assert_equal( + repr(x), + "usm_ndarray([[0, 1, 2]," + "\n [3, 4, 5]," + "\n [6, 7, 8]])", + ) + + x = dpt.arange(4, dtype="i4", sycl_queue=q) + assert repr(x) == "usm_ndarray([0, 1, 2, 3], dtype=int32)" + + dpt.set_print_options(linewidth=1) + np.testing.assert_equal( + repr(x), + "usm_ndarray([0," + "\n 1," + "\n 2," + "\n 3]," + "\n dtype=int32)", + ) + + # zero-size array + dpt.set_print_options(linewidth=75) + x = dpt.ones((9, 0), dtype="i4", sycl_queue=q) + assert repr(x) == "usm_ndarray([], shape=(9, 0), dtype=int32)" + + def test_print_repr_abbreviated(self): + q = get_queue_or_skip() + + dpt.set_print_options(threshold=0, edgeitems=1) + x = dpt.arange(9, dtype="int64", sycl_queue=q) + assert repr(x) == "usm_ndarray([0, ..., 8], shape=(9,))" + + y = dpt.asarray(x, dtype="i4", copy=True) + assert repr(y) == "usm_ndarray([0, ..., 8], shape=(9,), dtype=int32)" + + x = dpt.reshape(x, (3, 3)) + np.testing.assert_equal( + repr(x), + "usm_ndarray([[0, ..., 2]," + "\n ...," + "\n [6, ..., 8]], shape=(3, 3))", + ) + + y = dpt.reshape(y, (3, 3)) + np.testing.assert_equal( + repr(y), + "usm_ndarray([[0, ..., 2]," + "\n ...," + "\n [6, ..., 8]], shape=(3, 3), dtype=int32)", + ) + + dpt.set_print_options(linewidth=1) + np.testing.assert_equal( + repr(y), + "usm_ndarray([[0," + "\n ...," + "\n 2]," + "\n ...," + "\n [6," + "\n ...," + "\n 8]]," + "\n shape=(3, 3)," + "\n dtype=int32)", + ) + + @pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "u8", + "f2", + "f4", + "c8", + ], + ) + def test_repr_appended_dtype(self, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.empty(4, dtype=dtype) + assert repr(x).split("=")[-1][:-1] == x.dtype.name + + def test_usm_ndarray_repr_prefix(self): + q = get_queue_or_skip() + + x = dpt.arange(4, dtype=np.intp, sycl_queue=q) + np.testing.assert_equal( + dpt.usm_ndarray_repr(x, prefix="test"), "test([0, 1, 2, 3])" + ) + x = dpt.reshape(x, (2, 2)) + np.testing.assert_equal( + dpt.usm_ndarray_repr(x, prefix="test"), + "test([[0, 1]," "\n [2, 3]])", + ) + + +class TestContextManager: + def test_context_manager_basic(self): + options = dpt.get_print_options() + try: + X = dpt.asarray(1.234567) + except dpctl.SyclDeviceCreationError: + pytest.skip("No SYCL devices available") + with dpt.print_options(precision=4): + s = str(X) + assert s == "1.2346" + assert options == dpt.get_print_options() + + def test_context_manager_as(self): + with dpt.print_options(precision=4) as x: + options = x.copy() + assert options["precision"] == 4 diff --git a/dpnp/tests/tensor/test_usm_ndarray_reductions.py b/dpnp/tests/tensor/test_usm_ndarray_reductions.py new file mode 100644 index 000000000000..2c431efa936d --- /dev/null +++ 
b/dpnp/tests/tensor/test_usm_ndarray_reductions.py @@ -0,0 +1,704 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from random import randrange + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpnp.tensor as dpt +from dpnp.tensor._tensor_impl import default_device_index_type + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + +_all_dtypes = _no_complex_dtypes + [ + "c8", + "c16", +] + + +def test_max_min_axis(): + get_queue_or_skip() + + x = dpt.reshape( + dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7) + ) + + m = dpt.max(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, -1, -1, :, -1]) + + m = dpt.min(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, 0, 0, :, 0]) + + +def test_max_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5), (3, 4, 5)) + + m = dpt.max(x, axis=0) + assert dpt.all(m == x[-1, :, :]) + + x = dpt.flip(x, axis=2) + m = dpt.max(x, axis=2) + assert dpt.all(m == x[:, :, 0]) + + +def test_reduction_keepdims(): + get_queue_or_skip() + + n0, n1 = 3, 6 + x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4") + m = dpt.max(x, axis=(1, 2, -1), keepdims=True) + + xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1)) + p = dpt.argmax(xx, axis=-1, keepdims=True) + + assert m.shape == (n0, 1, 1, n1, 1) + assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + assert dpt.all(p == 0) + + +def test_max_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.max(x) + + assert m.shape == () + assert x == m + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_reduction_kernels(arg_dtype): + # i4 - always uses 
atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 3 + x[:, x.shape[1] // 2] = 3 + + m = dpt.max(x) + assert m == 3 + m = dpt.max(x, axis=0) + assert dpt.all(m == 3) + m = dpt.max(x, axis=1) + assert dpt.all(m == 3) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 0 + x[:, x.shape[1] // 2] = 0 + + m = dpt.min(x) + assert m == 0 + m = dpt.min(x, axis=0) + assert dpt.all(m == 0) + m = dpt.min(x, axis=1) + assert dpt.all(m == 0) + + +def test_max_min_nan_propagation(): + get_queue_or_skip() + + # float, finites + x = dpt.arange(4, dtype="f4") + x[0] = dpt.nan + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + # float, infinities + x[1:] = dpt.inf + assert dpt.isnan(dpt.max(x)) + x[1:] = -dpt.inf + assert dpt.isnan(dpt.min(x)) + + # complex + x = dpt.arange(4, dtype="c8") + x[0] = complex(dpt.nan, 0) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + x[0] = complex(0, dpt.nan) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + +def test_argmax_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.argmax(x) + + assert m.shape == () + assert m == 0 + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_search_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x_shape = (24, 1024) + x_size = np.prod(x_shape) + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 2 + + m = dpt.argmax(x) + assert m == idx + + # test case of strided input mapping to contig + # implementation + m = dpt.argmax(dpt.flip(x)) + assert m == x.size - 1 - idx + + # test case of strided implementation + y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q) + y[::2] = x + m = dpt.argmax(y) + assert m == 2 * idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = 3 + m = dpt.argmax(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = 4 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = 5 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx) + + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, x_shape) + x[idx] = 0 + + m = dpt.argmin(x) + assert m == idx + + x = dpt.reshape(x, x_shape) + + x[idx_tup[0], :] = -1 + m = dpt.argmin(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = -2 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = -3 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx) + + +def test_argmax_argmin_nan_propagation(): + get_queue_or_skip() + + sz = 4 + idx = randrange(sz) + # floats + x = dpt.arange(sz, dtype="f4") + x[idx] = dpt.nan + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + # complex + x = dpt.arange(sz, dtype="c8") + x[idx] = complex(dpt.nan, 0) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + x[idx] = complex(0, dpt.nan) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + +def 
test_argmax_argmin_identities(): + # make sure that identity arrays work as expected + get_queue_or_skip() + + x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4") + assert dpt.argmax(x) == 0 + x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") + assert dpt.argmin(x) == 0 + + +@pytest.mark.parametrize("order", ["C", "F"]) +def test_argmax_axis0_axis1(order): + get_queue_or_skip() + + x = dpt.asarray([[1, 2, 3], [6, 5, 4]], dtype="i4", order=order) + assert dpt.argmax(x) == 3 + + res = dpt.argmax(x, axis=0) + expected = dpt.asarray([1, 1, 1], dtype=res.dtype) + assert dpt.all(res == expected) + + res = dpt.argmax(x, axis=1) + expected = dpt.asarray([2, 0], dtype=res.dtype) + assert dpt.all(res == expected) + + +def test_reduction_arg_validation(): + get_queue_or_skip() + + x = {} + with pytest.raises(TypeError): + dpt.sum(x) + with pytest.raises(TypeError): + dpt.max(x) + with pytest.raises(TypeError): + dpt.argmax(x) + + x = dpt.zeros((0,), dtype="i4") + with pytest.raises(ValueError): + dpt.max(x) + with pytest.raises(ValueError): + dpt.argmax(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_logsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.logaddexp.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_logsumexp_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.logsumexp(x) + assert y.shape == () + assert y == -dpt.inf + + +def test_logsumexp_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + s = dpt.logsumexp(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + tol = dpt.finfo(s.dtype).resolution + assert_allclose( + dpt.asnumpy(s), + np.logaddexp.reduce(dpt.asnumpy(m), axis=(1, 2, -1), dtype=s.dtype), + rtol=tol, + atol=tol, + ) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_logsumexp_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_logsumexp_keepdims(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.logsumexp(m, axis=(1, 2, -1), keepdims=True) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + + +def test_logsumexp_keepdims_zero_size(): + get_queue_or_skip() + n = 10 + a = dpt.ones((n, 0, n)) + + s1 = dpt.logsumexp(a, keepdims=True) + assert s1.shape == (1, 1, 1) + + s2 = dpt.logsumexp(a, axis=(0, 1), keepdims=True) + assert s2.shape == (1, 1, n) + + s3 = dpt.logsumexp(a, axis=(1, 2), keepdims=True) + assert s3.shape == (n, 1, 1) + + s4 = dpt.logsumexp(a, axis=(0, 2), keepdims=True) + assert s4.shape == (1, 0, 1) + + a0 = a[0] + s5 = dpt.logsumexp(a0, keepdims=True) + assert s5.shape == (1, 1) + + +def test_logsumexp_scalar(): + get_queue_or_skip() + + m = dpt.ones(()) + s = dpt.logsumexp(m) + + assert isinstance(s, dpt.usm_ndarray) + assert m.sycl_queue == s.sycl_queue + assert s.shape == () + + +def 
test_logsumexp_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(ValueError): + dpt.logsumexp(x) + + +def test_logsumexp_int_axis(): + get_queue_or_skip() + + x = dpt.zeros((8, 10), dtype="f4") + res = dpt.logsumexp(x, axis=0) + assert res.ndim == 1 + assert res.shape[0] == 10 + + +def test_logsumexp_invalid_arr(): + x = {} + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_hypot_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.hypot.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_hypot_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.reduce_hypot(x) + assert y.shape == () + assert y == 0 + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_hypot_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_hypot_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(ValueError): + dpt.reduce_hypot(x) + + +def test_tree_reduction_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.logsumexp(x, axis=0) + tol = dpt.finfo(m.dtype).resolution + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=0, dtype=m.dtype), + rtol=tol, + atol=tol, + ) + + x = dpt.flip(x, axis=2) + m = dpt.logsumexp(x, axis=2) + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=2, dtype=m.dtype), + rtol=tol, + atol=tol, + ) + + +def test_numeric_reduction_out_kwarg(): + get_queue_or_skip() + + n1, n2, n3 = 3, 4, 5 + x = dpt.ones((n1, n2, n3), dtype="i8") + out = dpt.zeros((2 * n1, 3 * n2), dtype="i8") + res = dpt.sum(x, axis=-1, out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == 5) + + out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i8") + res = dpt.sum(x, axis=-1, keepdims=True, out=out[::-2, 1::3]) + assert res.shape == (n1, n2, 1) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == 5) + + res = dpt.sum(x, axis=0, out=x[-1]) + assert dpt.all(x[-1] == res) + assert dpt.all(x[-1] == 3) + assert dpt.all(x[0:-1] == 1) + + # test no-op case + x = dpt.ones((n1, n2, n3), dtype="i8") + out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i8") + res = dpt.sum(x, axis=(), out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == x) + + # test with dtype kwarg + x = dpt.ones((n1, n2, n3), dtype="i4") + out = dpt.zeros((2 * n1, 3 * n2), dtype="f4") + res = dpt.sum(x, axis=-1, dtype="f4", out=out[::-2, 1::3]) + zero_res = dpt.zeros_like(res) + assert dpt.allclose(out[::-2, 
0::3], zero_res) + assert dpt.allclose(out[::-2, 2::3], zero_res) + assert dpt.allclose(out[::-2, 1::3], res) + assert dpt.allclose(out[::-2, 1::3], dpt.full_like(res, 5, dtype="f4")) + + +def test_comparison_reduction_out_kwarg(): + get_queue_or_skip() + + n1, n2, n3 = 3, 4, 5 + x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3)) + out = dpt.zeros((2 * n1, 3 * n2), dtype="i4") + res = dpt.max(x, axis=-1, out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == x[:, :, -1]) + + out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i4") + res = dpt.max(x, axis=-1, keepdims=True, out=out[::-2, 1::3]) + assert res.shape == (n1, n2, 1) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == x[:, :, -1, dpt.newaxis]) + + # test no-op case + out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4") + res = dpt.max(x, axis=(), out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == x) + + # test overlap + res = dpt.max(x, axis=0, out=x[0]) + assert dpt.all(x[0] == res) + assert dpt.all(x[0] == x[-1]) + + +def test_search_reduction_out_kwarg(): + get_queue_or_skip() + + n1, n2, n3 = 3, 4, 5 + dt = dpt.__array_namespace_info__().default_dtypes()["indexing"] + + x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype=dt), (n1, n2, n3)) + out = dpt.zeros((2 * n1, 3 * n2), dtype=dt) + res = dpt.argmax(x, axis=-1, out=out[::-2, 1::3]) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == n2) + + out = dpt.zeros((2 * n1, 3 * n2, 1), dtype=dt) + res = dpt.argmax(x, axis=-1, keepdims=True, out=out[::-2, 1::3]) + assert res.shape == (n1, n2, 1) + assert dpt.all(out[::-2, 0::3] == 0) + assert dpt.all(out[::-2, 2::3] == 0) + assert dpt.all(out[::-2, 1::3] == res) + assert dpt.all(out[::-2, 1::3] == n3 - 1) + + # test no-op case + x = dpt.ones((), dtype=dt) + out = dpt.ones(2, dtype=dt) + res = dpt.argmax(x, axis=None, out=out[1]) + assert dpt.all(out[0] == 1) + assert dpt.all(out[1] == 0) + + # test overlap + x = dpt.reshape(dpt.arange(n1 * n2, dtype=dt), (n1, n2)) + res = dpt.argmax(x, axis=0, out=x[0]) + assert dpt.all(x[0] == res) + assert dpt.all(x[0] == n1 - 1) + + +def test_reduction_out_kwarg_arg_validation(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + + ind_dt = dpt.__array_namespace_info__().default_dtypes()["indexing"] + + x = dpt.ones(10, dtype="f4") + out_wrong_queue = dpt.empty((), dtype="f4", sycl_queue=q2) + out_wrong_dtype = dpt.empty((), dtype="i4", sycl_queue=q1) + out_wrong_shape = dpt.empty(1, dtype="f4", sycl_queue=q1) + out_wrong_keepdims = dpt.empty((), dtype="f4", sycl_queue=q1) + out_not_writable = dpt.empty((), dtype="f4", sycl_queue=q1) + out_not_writable.flags["W"] = False + + with pytest.raises(TypeError): + dpt.sum(x, out=dict()) + with pytest.raises(TypeError): + dpt.max(x, out=dict()) + with pytest.raises(TypeError): + dpt.argmax(x, out=dict()) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.sum(x, out=out_wrong_queue) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.max(x, out=out_wrong_queue) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.argmax(x, out=dpt.empty_like(out_wrong_queue, dtype=ind_dt)) + with pytest.raises(ValueError): + dpt.sum(x, 
out=out_wrong_dtype) + with pytest.raises(ValueError): + dpt.max(x, out=out_wrong_dtype) + with pytest.raises(ValueError): + dpt.argmax(x, out=dpt.empty_like(out_wrong_dtype, dtype="f4")) + with pytest.raises(ValueError): + dpt.sum(x, out=out_wrong_shape) + with pytest.raises(ValueError): + dpt.max(x, out=out_wrong_shape) + with pytest.raises(ValueError): + dpt.argmax(x, out=dpt.empty_like(out_wrong_shape, dtype=ind_dt)) + with pytest.raises(ValueError): + dpt.sum(x, out=out_not_writable) + with pytest.raises(ValueError): + dpt.max(x, out=out_not_writable) + with pytest.raises(ValueError): + search_not_writable = dpt.empty_like(out_not_writable, dtype=ind_dt) + search_not_writable.flags["W"] = False + dpt.argmax(x, out=search_not_writable) + with pytest.raises(ValueError): + dpt.sum(x, keepdims=True, out=out_wrong_keepdims) + with pytest.raises(ValueError): + dpt.max(x, keepdims=True, out=out_wrong_keepdims) + with pytest.raises(ValueError): + dpt.argmax( + x, + keepdims=True, + out=dpt.empty_like(out_wrong_keepdims, dtype=ind_dt), + ) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_count_nonzero(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + expected_dt = default_device_index_type(q.sycl_device) + + x = dpt.ones(10, dtype=dt, sycl_queue=q) + res = dpt.count_nonzero(x) + assert res == 10 + assert res.dtype == expected_dt + + x[3:6] = 0 + res = dpt.count_nonzero(x) + assert res == 7 + assert res.dtype == expected_dt diff --git a/dpnp/tests/tensor/test_usm_ndarray_search_functions.py b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py new file mode 100644 index 000000000000..33942d93c3a7 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_search_functions.py @@ -0,0 +1,593 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import ctypes +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt +from dpnp.tensor._search_functions import _where_result_type +from dpnp.tensor._type_utils import _all_data_types + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + + +class mock_device: + def __init__(self, fp16, fp64): + self.has_aspect_fp16 = fp16 + self.has_aspect_fp64 = fp64 + + +def test_where_basic(): + get_queue_or_skip() + + cond = dpt.asarray( + [ + [True, False, False], + [False, True, False], + [False, False, True], + [False, False, False], + [True, True, True], + ] + ) + out = dpt.where(cond, dpt.asarray(1), dpt.asarray(0)) + out_expected = dpt.asarray( + [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 1]] + ) + assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all() + + out = dpt.where(cond, dpt.ones(cond.shape), dpt.zeros(cond.shape)) + assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all() + + out = dpt.where( + cond, + dpt.ones(cond.shape[0], dtype="i4")[:, dpt.newaxis], + dpt.zeros(cond.shape[0], dtype="i4")[:, dpt.newaxis], + ) + assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all() + + +def _dtype_all_close(x1, x2): + if np.issubdtype(x2.dtype, np.floating) or np.issubdtype( + x2.dtype, np.complexfloating + ): + x2_dtype = x2.dtype + return np.allclose( + x1, x2, atol=np.finfo(x2_dtype).eps, rtol=np.finfo(x2_dtype).eps + ) + else: + return np.allclose(x1, x2) + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +@pytest.mark.parametrize("fp16", [True, False]) +@pytest.mark.parametrize("fp64", [True, False]) +def test_where_result_types(dt1, dt2, fp16, fp64): + dev = mock_device(fp16, fp64) + + dt1 = dpt.dtype(dt1) + dt2 = dpt.dtype(dt2) + res_t = _where_result_type(dt1, dt2, dev) + + if fp16 and fp64: + assert res_t == dpt.result_type(dt1, dt2) + else: + if res_t: + assert res_t.kind == dpt.result_type(dt1, dt2).kind + else: + # some illegal cases are covered above, but + # this guarantees that _where_result_type + # produces None only when one of the dtypes + # is illegal given fp aspects of device + all_dts = _all_data_types(fp16, fp64) + assert dt1 not in all_dts or dt2 not in all_dts + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_where_mask_dtypes(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + # mask dtype changes + cond = dpt.asarray([0, 1, 3, 0, 10], dtype=dt, sycl_queue=q) + x1 = dpt.asarray(0, dtype="f4", sycl_queue=q) + x2 = dpt.asarray(1, dtype="f4", sycl_queue=q) + res = dpt.where(cond, x1, x2) + + res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # contiguous cases + x1 = dpt.full(cond.shape, 0, dtype="f4", sycl_queue=q) + x2 = dpt.full(cond.shape, 1, dtype="f4", sycl_queue=q) + res = dpt.where(cond, x1, x2) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # input array dtype changes + cond = dpt.asarray([False, True, True, False, True], sycl_queue=q) + x1 = dpt.asarray(0, dtype=dt, sycl_queue=q) + x2 = dpt.asarray(1, dtype=dt, sycl_queue=q) + res = dpt.where(cond, x1, x2) + + res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # contiguous 
cases + x1 = dpt.full(cond.shape, 0, dtype=dt, sycl_queue=q) + x2 = dpt.full(cond.shape, 1, dtype=dt, sycl_queue=q) + res = dpt.where(cond, x1, x2) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + +def test_where_asymmetric_dtypes(): + q = get_queue_or_skip() + + cond = dpt.asarray([0, 1, 3, 0, 10], dtype="?", sycl_queue=q) + x1 = dpt.asarray(2, dtype="i4", sycl_queue=q) + x2 = dpt.asarray(3, dtype="i8", sycl_queue=q) + + res = dpt.where(cond, x1, x2) + res_check = np.asarray([3, 2, 2, 3, 2], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + # flip order + + res = dpt.where(cond, x2, x1) + res_check = np.asarray([2, 3, 3, 2, 3], dtype=res.dtype) + assert _dtype_all_close(dpt.asnumpy(res), res_check) + + +def test_where_nan_inf(): + get_queue_or_skip() + + cond = dpt.asarray([True, False, True, False], dtype="?") + x1 = dpt.asarray([np.nan, 2.0, np.inf, 3.0], dtype="f4") + x2 = dpt.asarray([2.0, np.nan, 3.0, np.inf], dtype="f4") + + cond_np = dpt.asnumpy(cond) + x1_np = dpt.asnumpy(x1) + x2_np = dpt.asnumpy(x2) + + res = dpt.where(cond, x1, x2) + res_np = np.where(cond_np, x1_np, x2_np) + + assert np.allclose(dpt.asnumpy(res), res_np, equal_nan=True) + + res = dpt.where(x1, cond, x2) + res_np = np.where(x1_np, cond_np, x2_np) + assert _dtype_all_close(dpt.asnumpy(res), res_np) + + +def test_where_empty(): + # check that numpy returns same results when + # handling empty arrays + get_queue_or_skip() + + empty = dpt.empty(0, dtype="i2") + m = dpt.asarray(True) + x1 = dpt.asarray(1, dtype="i2") + x2 = dpt.asarray(2, dtype="i2") + res = dpt.where(empty, x1, x2) + + empty_np = np.empty(0, dtype="i2") + m_np = dpt.asnumpy(m) + x1_np = dpt.asnumpy(x1) + x2_np = dpt.asnumpy(x2) + res_np = np.where(empty_np, x1_np, x2_np) + + assert_array_equal(dpt.asnumpy(res), res_np) + + res = dpt.where(m, empty, x2) + res_np = np.where(m_np, empty_np, x2_np) + + assert_array_equal(dpt.asnumpy(res), res_np) + + # check that broadcasting is performed + with pytest.raises(ValueError): + dpt.where(empty, x1, dpt.empty((1, 2))) + + +@pytest.mark.parametrize("order", ["C", "F"]) +def test_where_contiguous(order): + get_queue_or_skip() + + cond = dpt.asarray( + [ + [[True, False, False], [False, True, True]], + [[False, True, False], [True, False, True]], + [[False, False, True], [False, False, True]], + [[False, False, False], [True, False, True]], + [[True, True, True], [True, False, True]], + ], + order=order, + ) + + x1 = dpt.full(cond.shape, 2, dtype="i4", order=order) + x2 = dpt.full(cond.shape, 3, dtype="i4", order=order) + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + + assert _dtype_all_close(dpt.asnumpy(res), expected) + + +def test_where_contiguous1D(): + get_queue_or_skip() + + cond = dpt.asarray([True, False, True, False, False, True]) + + x1 = dpt.full(cond.shape, 2, dtype="i4") + x2 = dpt.full(cond.shape, 3, dtype="i4") + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + assert_array_equal(dpt.asnumpy(res), expected) + + # test with complex dtype (branch in kernel) + x1 = dpt.astype(x1, dpt.complex64) + x2 = dpt.astype(x2, dpt.complex64) + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + assert _dtype_all_close(dpt.asnumpy(res), expected) + + +def test_where_gh_1170(): + get_queue_or_skip() + + cond = dpt.asarray([False, True, True, False], dtype="?") + x1 = dpt.ones((3, 4), dtype="i4") + x2 = 
dpt.zeros((3, 4), dtype="i4") + + res = dpt.where(cond, x1, x2) + expected = np.broadcast_to(dpt.asnumpy(cond).astype(x1.dtype), x1.shape) + + assert_array_equal(dpt.asnumpy(res), expected) + + +def test_where_strided(): + get_queue_or_skip() + + s0, s1 = 4, 9 + cond = dpt.reshape( + dpt.asarray( + [True, False, False, False, True, True, False, True, False] * s0 + ), + (s0, s1), + )[:, ::3] + + x1 = dpt.reshape( + dpt.arange(cond.shape[0] * cond.shape[1] * 2, dtype="i4"), + (cond.shape[0], cond.shape[1] * 2), + )[:, ::2] + x2 = dpt.reshape( + dpt.arange(cond.shape[0] * cond.shape[1] * 3, dtype="i4"), + (cond.shape[0], cond.shape[1] * 3), + )[:, ::3] + expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2)) + res = dpt.where(cond, x1, x2) + + assert_array_equal(dpt.asnumpy(res), expected) + + # negative strides + res = dpt.where(cond, dpt.flip(x1), x2) + expected = np.where( + dpt.asnumpy(cond), np.flip(dpt.asnumpy(x1)), dpt.asnumpy(x2) + ) + assert_array_equal(dpt.asnumpy(res), expected) + + res = dpt.where(dpt.flip(cond), x1, x2) + expected = np.where( + np.flip(dpt.asnumpy(cond)), dpt.asnumpy(x1), dpt.asnumpy(x2) + ) + assert_array_equal(dpt.asnumpy(res), expected) + + +def test_where_invariants(): + get_queue_or_skip() + + test_sh = ( + 6, + 8, + ) + mask = dpt.asarray(np.random.choice([True, False], size=test_sh)) + p = dpt.ones(test_sh, dtype=dpt.int16) + m = dpt.full(test_sh, -1, dtype=dpt.int16) + inds_list = [ + ( + np.s_[:3], + np.s_[::2], + ), + ( + np.s_[::2], + np.s_[::2], + ), + ( + np.s_[::-1], + np.s_[:], + ), + ] + for ind in inds_list: + r1 = dpt.where(mask, p, m)[ind] + r2 = dpt.where(mask[ind], p[ind], m[ind]) + assert (dpt.asnumpy(r1) == dpt.asnumpy(r2)).all() + + +def test_where_arg_validation(): + get_queue_or_skip() + + check = {} + x1 = dpt.empty((1,), dtype="i4") + x2 = dpt.empty((1,), dtype="i4") + + with pytest.raises(TypeError): + dpt.where(check, x1, x2) + with pytest.raises(ValueError): + dpt.where(x1, check, x2) + with pytest.raises(ValueError): + dpt.where(x1, x2, check) + + +def test_where_compute_follows_data(): + q1 = get_queue_or_skip() + q2 = get_queue_or_skip() + q3 = get_queue_or_skip() + + x1 = dpt.empty((1,), dtype="i4", sycl_queue=q1) + x2 = dpt.empty((1,), dtype="i4", sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q1), x1, x2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q3), x1, x2) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(x1, x1, x2) + + +def test_where_order(): + get_queue_or_skip() + + test_sh = ( + 20, + 20, + ) + test_sh2 = tuple(2 * dim for dim in test_sh) + n = test_sh[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.zeros(test_sh, dtype=dt1, order="C") + ar2 = dpt.ones(test_sh, dtype=dt2, order="C") + condition = dpt.zeros(test_sh, dtype="?", order="C") + res1 = dpt.where(condition, ar1, ar2, order="C") + assert res1.flags.c_contiguous + res2 = dpt.where(condition, ar1, ar2, order="F") + assert res2.flags.f_contiguous + res3 = dpt.where(condition, ar1, ar2, order="A") + assert res3.flags.c_contiguous + res4 = dpt.where(condition, ar1, ar2, order="K") + assert res4.flags.c_contiguous + + ar1 = dpt.ones(test_sh, dtype=dt1, order="F") + ar2 = dpt.ones(test_sh, dtype=dt2, order="F") + condition = dpt.zeros(test_sh, dtype="?", order="F") + res1 = dpt.where(condition, ar1, ar2, order="C") + assert res1.flags.c_contiguous + res2 = 
dpt.where(condition, ar1, ar2, order="F")
+        assert res2.flags.f_contiguous
+        res3 = dpt.where(condition, ar1, ar2, order="A")
+        assert res3.flags.f_contiguous
+        res4 = dpt.where(condition, ar1, ar2, order="K")
+        assert res4.flags.f_contiguous
+
+        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2]
+        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2]
+        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2]
+        res1 = dpt.where(condition, ar1, ar2, order="K")
+        assert res1.strides == (n, -1)
+        res2 = dpt.where(condition, ar1, ar2, order="C")
+        assert res2.strides == (n, 1)
+
+        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2].mT
+        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2].mT
+        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2].mT
+        res1 = dpt.where(condition, ar1, ar2, order="K")
+        assert res1.strides == (-1, n)
+        res2 = dpt.where(condition, ar1, ar2, order="C")
+        assert res2.strides == (n, 1)
+
+        ar1 = dpt.ones(n, dtype=dt1, order="C")
+        ar2 = dpt.broadcast_to(dpt.ones(n, dtype=dt2, order="C"), test_sh)
+        condition = dpt.zeros(n, dtype="?", order="C")
+        res = dpt.where(condition, ar1, ar2, order="K")
+        assert res.strides == (20, 1)
+
+
+def test_where_unaligned():
+    get_queue_or_skip()
+
+    x = dpt.ones(513, dtype="i4")
+    a = dpt.full(512, 2, dtype="i4")
+    b = dpt.zeros(512, dtype="i4")
+
+    expected = dpt.full(512, 2, dtype="i4")
+    assert dpt.all(dpt.where(x[1:], a, b) == expected)
+
+
+def test_where_out():
+    get_queue_or_skip()
+
+    n1, n2, n3 = 3, 4, 5
+    ar1 = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3))
+    ar2 = dpt.full_like(ar1, -5)
+    condition = dpt.tile(
+        dpt.reshape(
+            dpt.asarray([True, False, False, True], dtype="?"), (1, n2, 1)
+        ),
+        (n1, 1, n3),
+    )
+
+    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4")
+    res = dpt.where(condition, ar1, ar2, out=out[::-2, 1::3, :])
+
+    assert dpt.all(res == out[::-2, 1::3, :])
+    assert dpt.all(out[::-2, 0::3, :] == 0)
+    assert dpt.all(out[::-2, 2::3, :] == 0)
+
+    assert dpt.all(res[:, 1:3, :] == -5)
+    assert dpt.all(res[:, 0, :] == ar1[:, 0, :])
+    assert dpt.all(res[:, 3, :] == ar1[:, 3, :])
+
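+    # the remaining cases pass out= views that alias condition, ar1, or ar2,
+    # checking that results under overlap match the non-overlapping outcome
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([1, 0], dtype="i4"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    res = dpt.where(
+        condition[:, ::-1, :], condition[:, ::-1, :], condition, out=condition
+    )
+    assert dpt.all(res == condition)
+    assert dpt.all(condition == 1)
+
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
+    ar2 = dpt.full_like(ar1, -5)
+    res = dpt.where(condition, ar1, ar2, out=ar2[:, ::-1, :])
+    assert dpt.all(ar2[:, ::-1, :] == res)
+    assert dpt.all(ar2[:, ::2, :] == -5)
+    assert dpt.all(ar2[:, 1::2, :] == 7)
+
+    condition = dpt.tile(
+        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
+        (n1, 2, n3),
+    )
+    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
+    ar2 = dpt.full_like(ar1, -5)
+    res = dpt.where(condition, ar1, ar2, out=ar1[:, ::-1, :])
+    assert dpt.all(ar1[:, ::-1, :] == res)
+    assert dpt.all(ar1[:, ::2, :] == -5)
+    assert dpt.all(ar1[:, 1::2, :] == 7)
+
+
+def test_where_out_arg_validation():
+    q1 = get_queue_or_skip()
+    q2 = get_queue_or_skip()
+
+    condition = dpt.ones(5, dtype="i4", sycl_queue=q1)
+    x1 = dpt.ones(5, dtype="i4", sycl_queue=q1)
+    x2 = dpt.ones(5, dtype="i4", sycl_queue=q1)
+
+    out_wrong_queue = dpt.empty_like(condition, sycl_queue=q2)
+    out_wrong_dtype = dpt.empty_like(condition, dtype="f4")
+    out_wrong_shape = dpt.empty(6, 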
dtype="i4", sycl_queue=q1) + out_not_writable = dpt.empty_like(condition) + out_not_writable.flags["W"] = False + + with pytest.raises(TypeError): + dpt.where(condition, x1, x2, out=dict()) + with pytest.raises(dpt.ExecutionPlacementError): + dpt.where(condition, x1, x2, out=out_wrong_queue) + with pytest.raises(ValueError): + dpt.where(condition, x1, x2, out=out_wrong_dtype) + with pytest.raises(ValueError): + dpt.where(condition, x1, x2, out=out_wrong_shape) + with pytest.raises(ValueError): + dpt.where(condition, x1, x2, out=out_not_writable) + + +@pytest.mark.parametrize("arr_dt", _all_dtypes) +def test_where_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + n1, n2 = 10, 10 + condition = dpt.tile( + dpt.reshape( + dpt.asarray([True, False], dtype="?", sycl_queue=q), (1, 2) + ), + (n1, n2 // 2), + ) + x = dpt.zeros((n1, n2), dtype=arr_dt, sycl_queue=q) + py_scalars = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_scalars: + r = dpt.where(condition, x, sc) + assert isinstance(r, dpt.usm_ndarray) + r = dpt.where(condition, sc, x) + assert isinstance(r, dpt.usm_ndarray) + + +def test_where_two_python_scalars(): + get_queue_or_skip() + + n1, n2 = 10, 10 + condition = dpt.tile( + dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2)), + (n1, n2 // 2), + ) + + py_scalars = [ + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ] + + for sc1, sc2 in itertools.product(py_scalars, repeat=2): + r = dpt.where(condition, sc1, sc2) + assert isinstance(r, dpt.usm_ndarray) diff --git a/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py new file mode 100644 index 000000000000..aef782f06f08 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_searchsorted.py @@ -0,0 +1,407 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import numpy as np +import pytest + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +def _check(hay_stack, needles, needles_np): + assert hay_stack.dtype == needles.dtype + assert hay_stack.ndim == 1 + + info_ = dpt.__array_namespace_info__() + default_dts_dev = info_.default_dtypes(device=hay_stack.device) + index_dt = default_dts_dev["indexing"] + + p_left = dpt.searchsorted(hay_stack, needles, side="left") + assert p_left.dtype == index_dt + + hs_np = dpt.asnumpy(hay_stack) + ref_left = np.searchsorted(hs_np, needles_np, side="left") + assert dpt.all(p_left == dpt.asarray(ref_left)) + + p_right = dpt.searchsorted(hay_stack, needles, side="right") + assert p_right.dtype == index_dt + + ref_right = np.searchsorted(hs_np, needles_np, side="right") + assert dpt.all(p_right == dpt.asarray(ref_right)) + + sorter = dpt.arange(hay_stack.size) + ps_left = dpt.searchsorted(hay_stack, needles, side="left", sorter=sorter) + assert ps_left.dtype == index_dt + assert dpt.all(ps_left == p_left) + ps_right = dpt.searchsorted(hay_stack, needles, side="right", sorter=sorter) + assert ps_right.dtype == index_dt + assert dpt.all(ps_right == p_right) + + +def test_searchsorted_contig_bool(): + get_queue_or_skip() + + dt = dpt.bool + + hay_stack = dpt.arange(0, 1, dtype=dt) + needles_np = np.random.choice([True, False], size=1024) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +def test_searchsorted_strided_bool(): + get_queue_or_skip() + + dt = dpt.bool + + hay_stack = dpt.repeat(dpt.arange(0, 1, dtype=dt), 4)[::4] + needles_np = np.random.choice([True, False], size=2 * 1024) + needles = dpt.asarray(needles_np) + sl = slice(None, None, -2) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +@pytest.mark.parametrize( + "idt", + [ + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ], +) +def test_searchsorted_contig_int(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + max_v = dpt.iinfo(dt).max + + hay_stack = dpt.arange(0, min(max_v, 255), dtype=dt) + needles_np = np.random.randint(0, max_v, dtype=dt, size=1024) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +@pytest.mark.parametrize( + "idt", + [ + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ], +) +def test_searchsorted_strided_int(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + max_v = dpt.iinfo(dt).max + + hay_stack = dpt.repeat(dpt.arange(0, min(max_v, 255), dtype=dt), 4)[1::4] + needles_np = np.random.randint(0, max_v, dtype=dt, size=2 * 1024) + needles = dpt.asarray(needles_np) + sl = slice(None, None, -2) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +def _add_extended_fp(array): + array[0] = -dpt.inf + array[-2] = dpt.inf + array[-1] = dpt.nan + + +@pytest.mark.parametrize("idt", [dpt.float16, 
dpt.float32, dpt.float64]) +def test_searchsorted_contig_fp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True) + _add_extended_fp(hay_stack) + + needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +@pytest.mark.parametrize("idt", [dpt.float16, dpt.float32, dpt.float64]) +def test_searchsorted_strided_fp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.repeat( + dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4 + )[1::4] + _add_extended_fp(hay_stack) + + needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt) + needles = dpt.asarray(needles_np) + sl = slice(1, None, 3) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +def _add_extended_cfp(array): + dt = array.dtype + ev_li = [ + complex(-dpt.inf, -1), + complex(-dpt.inf, -dpt.inf), + complex(-dpt.inf, dpt.inf), + complex(-dpt.inf, dpt.nan), + complex(0, -dpt.inf), + complex(0, -1), + complex(0, dpt.inf), + complex(0, dpt.nan), + complex(dpt.inf, -dpt.inf), + complex(dpt.inf, -1), + complex(dpt.inf, dpt.inf), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, -dpt.inf), + complex(dpt.nan, -1), + complex(dpt.nan, dpt.inf), + complex(dpt.nan, dpt.nan), + ] + ev = dpt.asarray(ev_li, dtype=dt, device=array.device) + return dpt.sort(dpt.concat((ev, array))) + + +@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128]) +def test_searchsorted_contig_cfp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True) + hay_stack = _add_extended_cfp(hay_stack) + needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) + _check( + hay_stack, + dpt.reshape(needles, (32, 32)), + np.reshape(needles_np, (32, 32)), + ) + + +@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128]) +def test_searchsorted_strided_cfp(idt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(idt, q) + + dt = dpt.dtype(idt) + + hay_stack = dpt.repeat( + dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4 + )[1::4] + needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt) + needles = dpt.asarray(needles_np) + sl = slice(1, None, 3) + + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + hay_stack = _add_extended_cfp(hay_stack) + _check(hay_stack, needles[sl], needles_np[sl]) + _check( + hay_stack, + dpt.reshape(needles[sl], (32, 32)), + np.reshape(needles_np[sl], (32, 32)), + ) + + +def test_searchsorted_coerce(): + get_queue_or_skip() + + x1_i4 = dpt.arange(5, dtype="i4") + x1_i8 = dpt.arange(5, dtype="i8") + x2_i4 = dpt.arange(5, dtype="i4") + x2_i8 = dpt.arange(5, dtype="i8") + + p1 = dpt.searchsorted(x1_i4, x2_i8) + p2 = dpt.searchsorted(x1_i8, x2_i8) + p3 = dpt.searchsorted(x1_i8, x2_i4) + assert dpt.all(p1 == p2) + assert dpt.all(p2 == p3) + + +def test_searchsorted_validation(): + with pytest.raises(TypeError): + dpt.searchsorted(None, None) + try: + x1 = 
dpt.arange(10, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device could not be created") + with pytest.raises(TypeError): + dpt.searchsorted(x1, None) + with pytest.raises(TypeError): + dpt.searchsorted(x1, x1, sorter=dict()) + with pytest.raises(ValueError): + dpt.searchsorted(x1, x1, side="unknown") + + +def test_searchsorted_validation2(): + try: + x1 = dpt.arange(10, dtype="i4") + sorter = dpt.arange(10, dtype="i4") + except dpctl.SyclDeviceCreationError: + pytest.skip("Default device could not be created") + d = x1.sycl_device + q2 = dpctl.SyclQueue(d, property="in_order") + x2 = dpt.ones(5, dtype=x1.dtype, sycl_queue=q2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.searchsorted(x1, x2) + + with pytest.raises(dpt.ExecutionPlacementError): + dpt.searchsorted(x1, x2, sorter=sorter) + + sorter = dpt.ones(x1.shape, dtype=dpt.bool) + # non-integral sorter.dtype raises + with pytest.raises(ValueError): + dpt.searchsorted(x1, x1, sorter=sorter) + + # non-matching x1.shape and sorter.shape raises + with pytest.raises(ValueError): + dpt.searchsorted(x1, x1, sorter=sorter[:-1]) + + # x1 must be 1d, or ValueError is raised + with pytest.raises(ValueError): + dpt.searchsorted(x1[dpt.newaxis, :], x1) + + +def test_pw_linear_interpolation_example(): + get_queue_or_skip() + + bins = dpt.asarray([0.0, 0.05, 0.2, 0.25, 0.5, 0.8, 0.95, 1]) + vals = dpt.asarray([0.1, 0.15, 0.3, 0.5, 0.7, 0.53, 0.37, 0.1]) + assert vals.shape == bins.shape + data_np = np.random.uniform(0, 1, size=10000) + data = dpt.asarray(data_np) + + p = dpt.searchsorted(bins, data) + w = (data - bins[p]) / (bins[p - 1] - bins[p]) + assert dpt.min(w) >= 0 + assert dpt.max(w) <= 1 + interp_vals = vals[p - 1] * w + (1 - w) * vals[p] + + assert interp_vals.shape == data.shape + assert dpt.min(interp_vals) >= dpt.zeros(tuple()) + av = dpt.sum(interp_vals) / data.size + exp = dpt.vecdot(vals[1:] + vals[:-1], bins[1:] - bins[:-1]) / 2 + + assert dpt.abs(av - exp) < 0.1 + + +def test_out_of_bound_sorter_values(): + get_queue_or_skip() + + x = dpt.asarray([1, 2, 0], dtype="i4") + n = x.shape[0] + + # use out-of-bounds indices in sorter + sorter = dpt.asarray([2, 0 - n, 1 - n], dtype="i8") + + x2 = dpt.arange(3, dtype=x.dtype) + p = dpt.searchsorted(x, x2, sorter=sorter) + # verify that they were applied with mode="wrap" + assert dpt.all(p == dpt.arange(3, dtype=p.dtype)) + + +def test_searchsorted_strided_scalar_needle(): + get_queue_or_skip() + + a_max = 255 + + hay_stack = dpt.flip( + dpt.repeat(dpt.arange(a_max - 1, -1, -1, dtype=dpt.int32), 4) + ) + needles_np = np.squeeze( + np.random.randint(0, a_max, dtype=dpt.int32, size=1), axis=0 + ) + needles = dpt.asarray(needles_np) + + _check(hay_stack, needles, needles_np) diff --git a/dpnp/tests/tensor/test_usm_ndarray_sorting.py b/dpnp/tests/tensor/test_usm_ndarray_sorting.py new file mode 100644 index 000000000000..af96811bf2f9 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_sorting.py @@ -0,0 +1,397 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +import dpnp.tensor as dpt + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s = dpt.sort(inp, descending=False) + assert dpt.all(s[:-1] <= s[1:]) + + s1 = dpt.sort(inp, descending=True) + assert dpt.all(s1[:-1] >= s1[1:]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_sort_2d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + fl = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + inp = dpt.reshape(fl, (20, -1)) + + s = dpt.sort(inp, axis=1, descending=False) + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + s1 = dpt.sort(inp, axis=1, descending=True) + assert dpt.all(s1[:, :-1] >= s1[:, 1:]) + + +def test_sort_strides(): + get_queue_or_skip() + + fl = dpt.roll( + dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))), + 734, + ) + inp = dpt.reshape(fl, (-1, 20)) + + s = dpt.sort(inp, axis=0, descending=False) + assert dpt.all(s[:-1, :] <= s[1:, :]) + + s1 = dpt.sort(inp, axis=0, descending=True) + assert dpt.all(s1[:-1, :] >= s1[1:, :]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_argsort_1d(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + inp = dpt.roll( + dpt.concat( + (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype)) + ), + 734, + ) + + s_idx = dpt.argsort(inp, descending=False) + assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]]) + + s1_idx = dpt.argsort(inp, descending=True) + assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]]) + + +def 
test_sort_validation(): + with pytest.raises(TypeError): + dpt.sort(dict()) + + +def test_sort_validation_kind(): + get_queue_or_skip() + + x = dpt.ones(128, dtype="u1") + + with pytest.raises(ValueError): + dpt.sort(x, kind=Ellipsis) + + with pytest.raises(ValueError): + dpt.sort(x, kind="invalid") + + +def test_argsort_validation(): + with pytest.raises(TypeError): + dpt.argsort(dict()) + + +def test_argsort_validation_kind(): + get_queue_or_skip() + + x = dpt.arange(127, stop=0, step=-1, dtype="i1") + + with pytest.raises(ValueError): + dpt.argsort(x, kind=Ellipsis) + + with pytest.raises(ValueError): + dpt.argsort(x, kind="invalid") + + +_all_kinds = ["stable", "mergesort", "radixsort"] + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_axis0(kind): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + s = dpt.sort(x, axis=0, kind=kind) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_axis0(kind): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=0, kind=kind) + + s = dpt.take_along_axis(x, idx, axis=0) + + assert dpt.all(s[:-1, :] <= s[1:, :]) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_axis1(kind): + get_queue_or_skip() + + n, m = 200, 30 + xf = dpt.arange(n * m, 0, step=-1, dtype="i4") + x = dpt.reshape(xf, (n, m)) + idx = dpt.argsort(x, axis=1, kind=kind) + + s = dpt.take_along_axis(x, idx, axis=1) + + assert dpt.all(s[:, :-1] <= s[:, 1:]) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_strided(kind): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + s = dpt.sort(x_flipped, kind=kind) + + assert dpt.all(s == x_orig) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_strided(kind): + get_queue_or_skip() + + x_orig = dpt.arange(100, dtype="i4") + x_flipped = dpt.flip(x_orig, axis=0) + idx = dpt.argsort(x_flipped, kind=kind) + s = dpt.take_along_axis(x_flipped, idx, axis=0) + + assert dpt.all(s == x_orig) + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_0d_array(kind): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + expected = dpt.asarray(1, dtype="i4") + assert dpt.sort(x, kind=kind) == expected + + +@pytest.mark.parametrize("kind", _all_kinds) +def test_argsort_0d_array(kind): + get_queue_or_skip() + + x = dpt.asarray(1, dtype="i4") + expected = dpt.asarray(0, dtype="i4") + assert dpt.argsort(x, kind=kind) == expected + + +@pytest.mark.parametrize( + "dtype", + [ + "f2", + "f4", + "f8", + ], +) +@pytest.mark.parametrize("kind", _all_kinds) +def test_sort_real_fp_nan(dtype, kind): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray( + [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype + ) + s = dpt.sort(x, kind=kind) + + expected = dpt.asarray( + [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + s = dpt.sort(x, descending=True, kind=kind) + + expected = dpt.asarray( + [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype + ) + + assert dpt.allclose(s, expected, equal_nan=True) + + +@pytest.mark.parametrize( + "dtype", + [ + "c8", + "c16", + ], +) +def test_sort_complex_fp_nan(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + rvs = [-0.0, 0.1, 0.0, 
0.2, -0.3, dpt.nan]
+    ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan]
+
+    cv = []
+    for rv in rvs:
+        for iv in ivs:
+            cv.append(complex(rv, iv))
+
+    inp = dpt.asarray(cv, dtype=dtype)
+    s = dpt.sort(inp)
+
+    expected = np.sort(dpt.asnumpy(inp))
+
+    assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True)
+
+    # sort every ordered pair of elements and require bit-exact agreement
+    # with NumPy's ordering
+    pairs = []
+    for i, j in itertools.permutations(range(inp.shape[0]), 2):
+        pairs.append([i, j])
+    sub_arrs = inp[dpt.asarray(pairs)]
+    m1 = dpt.asnumpy(dpt.sort(sub_arrs, axis=1))
+    m2 = np.sort(dpt.asnumpy(sub_arrs), axis=1)
+    for k in range(len(pairs)):
+        i, j = pairs[k]
+        r1 = m1[k]
+        r2 = m2[k]
+        # viewing as int64 makes the comparison bitwise, so -0.0 vs 0.0
+        # and distinct NaN bit patterns are not conflated
+        assert np.array_equal(
+            r1.view(np.int64), r2.view(np.int64)
+        ), f"Failed for {i} and {j}"
+
+
+def test_radix_sort_size_1_axis():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((), dtype="i1")
+    r1 = dpt.sort(x1, kind="radixsort")
+    assert_array_equal(dpt.asnumpy(r1), dpt.asnumpy(x1))
+
+    x2 = dpt.ones([1], dtype="i1")
+    r2 = dpt.sort(x2, kind="radixsort")
+    assert_array_equal(dpt.asnumpy(r2), dpt.asnumpy(x2))
+
+    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
+    r3 = dpt.sort(x3, kind="radixsort")
+    assert dpt.asnumpy(r3 == x3).all()
+
+    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
+    r4 = dpt.sort(x4, axis=0, kind="radixsort")
+    assert dpt.asnumpy(r4 == x4).all()
+
+
+def test_radix_argsort_size_1_axis():
+    get_queue_or_skip()
+
+    x1 = dpt.ones((), dtype="i1")
+    r1 = dpt.argsort(x1, kind="radixsort")
+    assert r1 == 0
+
+    x2 = dpt.ones([1], dtype="i1")
+    r2 = dpt.argsort(x2, kind="radixsort")
+    assert dpt.asnumpy(r2 == 0).all()
+
+    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
+    r3 = dpt.argsort(x3, kind="radixsort")
+    assert dpt.asnumpy(r3 == 0).all()
+
+    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
+    r4 = dpt.argsort(x4, axis=0, kind="radixsort")
+    assert dpt.asnumpy(r4 == 0).all()
diff --git a/dpnp/tests/tensor/test_usm_ndarray_top_k.py b/dpnp/tests/tensor/test_usm_ndarray_top_k.py
new file mode 100644
index 000000000000..1c04c1fff57a
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_top_k.py
@@ -0,0 +1,331 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+def _expected_largest_inds(inp, n, shift, k):
+    "Compute expected top_k indices for mode='largest'"
+    assert k < n
+    ones_start_id = shift % (2 * n)
+
+    alloc_dev = inp.device
+
+    if ones_start_id < n:
+        expected_inds = dpt.arange(
+            ones_start_id, ones_start_id + k, dtype="i8", device=alloc_dev
+        )
+    else:
+        # wrap-around
+        ones_end_id = (ones_start_id + n) % (2 * n)
+        if ones_end_id >= k:
+            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
+        else:
+            expected_inds = dpt.concat(
+                (
+                    dpt.arange(ones_end_id, dtype="i8", device=alloc_dev),
+                    dpt.arange(
+                        ones_start_id,
+                        ones_start_id + k - ones_end_id,
+                        dtype="i8",
+                        device=alloc_dev,
+                    ),
+                )
+            )
+
+    return expected_inds
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [33, 43, 255, 511, 1021, 8193])
+def test_top_k_1d_largest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shift, k = 734, 5
+    o = dpt.ones(n, dtype=dtype)
+    z = dpt.zeros(n, dtype=dtype)
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_largest_inds(oz, n, shift, k)
+
+    s = dpt.top_k(inp, k, mode="largest")
+    assert s.values.shape == (k,)
+    assert s.values.dtype == inp.dtype
+    assert s.indices.shape == (k,)
+    assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
+    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
+
+
+def _expected_smallest_inds(inp, n, shift, k):
+    "Compute expected top_k indices for mode='smallest'"
+    assert k < n
+    zeros_start_id = (n + shift) % (2 * n)
+    zeros_end_id = (shift) % (2 * n)
+
+    alloc_dev = inp.device
+
+    if zeros_start_id < zeros_end_id:
+        expected_inds = dpt.arange(
+            zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev
+        )
+    else:
+        # wrap-around
+        if zeros_end_id >= k:
+            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
+        else:
+            expected_inds = dpt.concat(
+                (
+                    dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev),
+                    dpt.arange(
+                        zeros_start_id,
+                        zeros_start_id + k - zeros_end_id,
+                        dtype="i8",
+                        device=alloc_dev,
+                    ),
+                )
+            )
+
+    return expected_inds
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_1d_smallest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    shift, k = 734, 5
+    o = dpt.ones(n, dtype=dtype)
+    z = dpt.zeros(n, dtype=dtype)
+    oz = dpt.concat((o, z))
+    inp = dpt.roll(oz, shift)
+
+    expected_inds = _expected_smallest_inds(oz, n, shift, k)
+
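+    # the k smallest entries are the zeros planted above; check the returned
+    # values and the wrap-around index positions computed by the helper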
+    s = dpt.top_k(inp, k, mode="smallest")
+    assert s.values.shape == (k,)
+    assert s.values.dtype == inp.dtype
+    assert s.indices.shape == (k,)
+    assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values
+    assert dpt.all(s.values == inp[s.indices]), s.indices
+    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        # skip short types to ensure that m*n can be represented
+        # in the type
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_2d_largest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m, k = 8, 3
+    if dtype == "f2" and m * n > 2000:
+        pytest.skip(
+            "f2 cannot distinguish between large integers used in this test"
+        )
+
+    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
+
+    r = dpt.top_k(x, k, axis=1)
+
+    assert r.values.shape == (m, k)
+    assert r.indices.shape == (m, k)
+    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
+        :, -k:
+    ]
+    assert expected_inds.shape == (1, k)
+    assert dpt.all(
+        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
+    ), (r.indices, expected_inds)
+    expected_vals = x[:, -k:]
+    assert dpt.all(
+        dpt.sort(r.values, axis=1) == dpt.sort(expected_vals, axis=1)
+    )
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        # skip short types to ensure that m*n can be represented
+        # in the type
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
+def test_top_k_2d_smallest(dtype, n):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    m, k = 8, 3
+    if dtype == "f2" and m * n > 2000:
+        pytest.skip(
+            "f2 cannot distinguish between large integers used in this test"
+        )
+
+    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
+
+    r = dpt.top_k(x, k, axis=1, mode="smallest")
+
+    assert r.values.shape == (m, k)
+    assert r.indices.shape == (m, k)
+    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
+        :, :k
+    ]
+    assert dpt.all(
+        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
+    )
+    assert dpt.all(dpt.sort(r.values, axis=1) == dpt.sort(x[:, :k], axis=1))
+
+
+def test_top_k_0d():
+    get_queue_or_skip()
+
+    a = dpt.ones((), dtype="i4")
+    assert a.ndim == 0
+    assert a.size == 1
+
+    r = dpt.top_k(a, 1)
+    assert r.values == a
+    assert r.indices == dpt.zeros_like(a, dtype=r.indices.dtype)
+
+
+def test_top_k_noncontig():
+    get_queue_or_skip()
+
+    a = dpt.arange(256, dtype=dpt.int32)[::2]
+    r = dpt.top_k(a, 3)
+
+    assert dpt.all(dpt.sort(r.values) == dpt.asarray([250, 252, 254])), r.values
+    assert dpt.all(
+        dpt.sort(r.indices) == dpt.asarray([125, 126, 127])
+    ), r.indices
+
+
+def test_top_k_axis0():
+    get_queue_or_skip()
+
+    m, n, k = 128, 8, 3
+    x = dpt.reshape(dpt.arange(m * n, dtype=dpt.int32), (m, n))
+
+    r = dpt.top_k(x, k, axis=0, mode="smallest")
+    assert r.values.shape == (k, n)
+    assert r.indices.shape == (k, n)
+    expected_inds = dpt.reshape(dpt.arange(m, dtype=r.indices.dtype), (m, 1))[
+        :k, :
+    ]
+    assert dpt.all(
+        dpt.sort(r.indices, axis=0) == dpt.sort(expected_inds, axis=0)
+    )
+    assert dpt.all(dpt.sort(r.values, axis=0) == dpt.sort(x[:k, :], axis=0))
+
+
+def test_top_k_validation():
+    get_queue_or_skip()
+    x = dpt.ones(10, dtype=dpt.int64)
+    with pytest.raises(ValueError):
+        # k must be positive
+        dpt.top_k(x, -1)
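+    # the remaining misuses below must raise as well: a non-array argument,
+    # k exceeding the axis length, k != 1 for a 0d input, and an unknown mode
+    with pytest.raises(TypeError):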
+        # argument should be usm_ndarray
+        dpt.top_k(list(), 2)
+    x2 = dpt.reshape(x, (2, 5))
+    with pytest.raises(ValueError):
+        # k must not exceed array dimension
+        # along specified axis
+        dpt.top_k(x2, 100, axis=1)
+    with pytest.raises(ValueError):
+        # for 0d arrays, k must be 1
+        dpt.top_k(x[0], 2)
+    with pytest.raises(ValueError):
+        # mode must be "largest" or "smallest"
+        dpt.top_k(x, 2, mode="invalid")
diff --git a/dpnp/tests/tensor/test_usm_ndarray_unique.py b/dpnp/tests/tensor/test_usm_ndarray_unique.py
new file mode 100644
index 000000000000..d602c0346f5d
--- /dev/null
+++ b/dpnp/tests/tensor/test_usm_ndarray_unique.py
@@ -0,0 +1,361 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+import pytest
+
+import dpnp.tensor as dpt
+
+from .helper import (
+    get_queue_or_skip,
+    skip_if_dtype_not_supported,
+)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_values(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+
+
+def test_unique_values_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv = dpt.unique_values(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_counts(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
+
+
+def test_unique_counts_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, uv_counts = dpt.unique_counts(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_inverse(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+
+def test_unique_inverse_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, inv = dpt.unique_inverse(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "i1",
+        "u1",
+        "i2",
+        "u2",
+        "i4",
+        "u4",
+        "i8",
+        "u8",
+        "f2",
+        "f4",
+        "f8",
+        "c8",
+        "c16",
+    ],
+)
+def test_unique_all(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n, roll = 10000, 734
+    inp = dpt.roll(
+        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
+        roll,
+    )
+
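+    # unique_all bundles values, first-occurrence indices, the inverse
+    # mapping, and counts in a single call; check all four outputs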
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
+    assert dpt.all(uv == inp[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
+
+
+def test_unique_all_strided():
+    get_queue_or_skip()
+
+    n, m = 1000, 20
+    inp = dpt.ones((n, m), dtype="i4", order="F")
+    inp[:, ::2] = 0
+
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv == dpt.reshape(inp, -1)[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+    inp = dpt.flip(dpt.reshape(inp, -1))
+
+    uv, ind, inv, uv_counts = dpt.unique_all(inp)
+    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
+    assert dpt.all(uv == inp[ind])
+    assert dpt.all(inp == uv[inv])
+    assert inp.shape == inv.shape
+    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
+
+
+def test_set_functions_empty_input():
+    get_queue_or_skip()
+    x = dpt.ones((10, 0, 1), dtype="i4")
+
+    res = dpt.unique_values(x)
+    assert isinstance(res, dpt.usm_ndarray)
+    assert res.size == 0
+    assert res.dtype == x.dtype
+
+    res = dpt.unique_inverse(x)
+    assert type(res).__name__ == "UniqueInverseResult"
+    uv, inv = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(inv, dpt.usm_ndarray)
+    assert inv.size == 0
+
+    res = dpt.unique_counts(x)
+    assert type(res).__name__ == "UniqueCountsResult"
+    uv, uv_counts = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(uv_counts, dpt.usm_ndarray)
+    assert uv_counts.size == 0
+
+    res = dpt.unique_all(x)
+    assert type(res).__name__ == "UniqueAllResult"
+    uv, ind, inv, uv_counts = res
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.size == 0
+    assert isinstance(ind, dpt.usm_ndarray)
+    assert ind.size == 0
+    assert isinstance(inv, dpt.usm_ndarray)
+    assert inv.size == 0
+    assert isinstance(uv_counts, dpt.usm_ndarray)
+    assert uv_counts.size == 0
+
+
+def test_set_function_outputs():
+    get_queue_or_skip()
+    # check standard and early exit paths
+    x1 = dpt.arange(10, dtype="i4")
+    x2 = dpt.ones((10, 10), dtype="i4")
+
+    assert isinstance(dpt.unique_values(x1), dpt.usm_ndarray)
+    assert isinstance(dpt.unique_values(x2), dpt.usm_ndarray)
+
+    assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult"
+    assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult"
+
+    assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult"
+    assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult"
+
+    assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult"
+    assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult"
+
+
+def test_set_functions_compute_follows_data():
+    # tests that all intermediate calls and allocations are compatible
+    # with an input allocated on an arbitrary queue
+    get_queue_or_skip()
+    q = dpctl.SyclQueue()
+    x = dpt.arange(10, dtype="i4", sycl_queue=q)
+
+    uv = dpt.unique_values(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    uv, uc = dpt.unique_counts(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert isinstance(uc, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    assert uc.sycl_queue == q
+    uv, inv_ind = dpt.unique_inverse(x)
+    assert isinstance(uv, dpt.usm_ndarray)
+    assert isinstance(inv_ind, dpt.usm_ndarray)
+    assert uv.sycl_queue == q
+    assert inv_ind.sycl_queue == q
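+    # unique_all must likewise place all four outputs on the input's queue
+    uv, ind, inv_ind, uc = 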
dpt.unique_all(x) + assert isinstance(uv, dpt.usm_ndarray) + assert isinstance(ind, dpt.usm_ndarray) + assert isinstance(inv_ind, dpt.usm_ndarray) + assert isinstance(uc, dpt.usm_ndarray) + assert uv.sycl_queue == q + assert ind.sycl_queue == q + assert inv_ind.sycl_queue == q + assert uc.sycl_queue == q + + +def test_gh_1738(): + get_queue_or_skip() + + ones = dpt.ones(10, dtype="i8") + iota = dpt.arange(10, dtype="i8") + + assert ones.device == iota.device + + dpt_info = dpt.__array_namespace_info__() + ind_dt = dpt_info.default_dtypes(device=ones.device)["indexing"] + + dt = dpt.unique_inverse(ones).inverse_indices.dtype + assert dt == ind_dt + dt = dpt.unique_all(ones).inverse_indices.dtype + assert dt == ind_dt + + dt = dpt.unique_inverse(iota).inverse_indices.dtype + assert dt == ind_dt + dt = dpt.unique_all(iota).inverse_indices.dtype + assert dt == ind_dt diff --git a/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py b/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py new file mode 100644 index 000000000000..b6d6293ade73 --- /dev/null +++ b/dpnp/tests/tensor/test_usm_ndarray_utility_functions.py @@ -0,0 +1,199 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from random import randrange + +import numpy as np +import pytest +from numpy.testing import assert_array_equal, assert_equal + +import dpnp.tensor as dpt +from dpnp.tensor._numpy_helper import AxisError + +from .helper import ( + get_queue_or_skip, + skip_if_dtype_not_supported, +) + +_all_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_boolean_reduction_dtypes_contig(func, identity, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.full(10, identity, dtype=dtype, sycl_queue=q) + res = func(x) + + assert_equal(dpt.asnumpy(res), identity) + + x[randrange(x.size)] = not identity + res = func(x) + assert_equal(dpt.asnumpy(res), not identity) + + # test branch in kernel for large arrays + wg_size = 4 * 32 + x = dpt.full((wg_size + 1), identity, dtype=dtype, sycl_queue=q) + res = func(x) + assert_equal(dpt.asnumpy(res), identity) + + x[randrange(x.size)] = not identity + res = func(x) + assert_equal(dpt.asnumpy(res), not identity) + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_boolean_reduction_dtypes_strided(func, identity, dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.full(20, identity, dtype=dtype, sycl_queue=q)[::-2] + res = func(x) + assert_equal(dpt.asnumpy(res), identity) + + x[randrange(x.size)] = not identity + res = func(x) + assert_equal(dpt.asnumpy(res), not identity) + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +def test_boolean_reduction_axis(func, identity): + get_queue_or_skip() + + x = dpt.full((2, 3, 4, 5, 6), identity, dtype="i4") + res = func(x, axis=(1, 2, -1)) + + assert res.shape == (2, 5) + assert_array_equal(dpt.asnumpy(res), np.full(res.shape, identity)) + + # make first row of output negation of identity + x[0, 0, 0, ...] 
= not identity + res = func(x, axis=(1, 2, -1)) + assert_array_equal(dpt.asnumpy(res[0]), np.full(res.shape[1], not identity)) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reduction_keepdims(func): + get_queue_or_skip() + + x = dpt.ones((2, 3, 4, 5, 6), dtype="i4") + res = func(x, axis=(1, 2, -1), keepdims=True) + assert res.shape == (2, 1, 1, 5, 1) + assert_array_equal(dpt.asnumpy(res), np.full(res.shape, True)) + + res = func(x, axis=None, keepdims=True) + assert res.shape == (1,) * x.ndim + + +@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)]) +def test_boolean_reduction_empty(func, identity): + get_queue_or_skip() + + x = dpt.empty((0,), dtype="i4") + res = func(x) + assert_equal(dpt.asnumpy(res), identity) + + +# nan, inf, and -inf should evaluate to true +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reductions_nan_inf(func): + q = get_queue_or_skip() + + x = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q)[ + :, dpt.newaxis + ] + res = func(x, axis=1) + assert_array_equal(dpt.asnumpy(res), np.array([True, True, True])) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reduction_scalars(func): + get_queue_or_skip() + + x = dpt.ones((), dtype="i4") + assert_equal(dpt.asnumpy(func(x)), True) + + x = dpt.zeros((), dtype="i4") + assert_equal(dpt.asnumpy(func(x)), False) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_boolean_reduction_empty_axis(func): + get_queue_or_skip() + + x = dpt.ones((5,), dtype="i4") + res = func(x, axis=()) + assert_array_equal(dpt.asnumpy(res), dpt.asnumpy(x).astype(np.bool_)) + + +@pytest.mark.parametrize("func", [dpt.all, dpt.any]) +def test_arg_validation_boolean_reductions(func): + get_queue_or_skip() + + x = dpt.ones((4, 5), dtype="i4") + d = {} + + with pytest.raises(TypeError): + func(d) + with pytest.raises(AxisError): + func(x, axis=-3) + + +def test_boolean_reductions_3d_gh_1327(): + get_queue_or_skip() + + size = 24 + x = dpt.reshape(dpt.arange(-10, size - 10, 1, dtype="i4"), (2, 3, 4)) + res = dpt.all(x, axis=0) + res_np = np.full(res.shape, True, dtype="?") + res_np[2, 2] = False + + assert (dpt.asnumpy(res) == res_np).all() + + x = dpt.ones((2, 3, 4, 5), dtype="i4") + res = dpt.any(x, axis=0) + + assert (dpt.asnumpy(res) == np.full(res.shape, True, dtype="?")).all() diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py index 0e2fe7dc5a04..9b1f0cc20108 100644 --- a/dpnp/tests/test_array_api_info.py +++ b/dpnp/tests/test_array_api_info.py @@ -1,10 +1,9 @@ -import numpy import pytest from dpctl import get_devices, select_default_device -from dpctl.tensor._tensor_impl import default_device_complex_type import dpnp from dpnp.exceptions import SyclDeviceCreationError +from dpnp.tensor._tensor_impl import default_device_complex_type from dpnp.tests.helper import ( has_support_aspect64, is_win_platform, diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index d8a80ddbff78..b195c0484105 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -2,7 +2,6 @@ from math import prod import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -14,6 +13,7 @@ ) import dpnp +import dpnp.tensor as dpt from .helper import ( assert_dtype_allclose, diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py index fe74368a8c81..25c454b97613 100644 --- 
a/dpnp/tests/test_arraymanipulation.py +++ b/dpnp/tests/test_arraymanipulation.py @@ -1,11 +1,9 @@ -import warnings - -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_array_equal, assert_equal, assert_raises import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import AxisError from .helper import get_all_dtypes, get_float_complex_dtypes diff --git a/dpnp/tests/test_cli_options.py b/dpnp/tests/test_cli_options.py new file mode 100644 index 000000000000..0caca95f3974 --- /dev/null +++ b/dpnp/tests/test_cli_options.py @@ -0,0 +1,20 @@ +import subprocess +import sys + + +def test_tensor_includes(): + res = subprocess.run( + [sys.executable, "-m", "dpnp", "--tensor-includes"], + capture_output=True, + ) + assert res.returncode == 0 + assert res.stdout + flags = res.stdout.decode("utf-8") + res = subprocess.run( + [sys.executable, "-m", "dpnp", "--tensor-include-dir"], + capture_output=True, + ) + assert res.returncode == 0 + assert res.stdout + dir = res.stdout.decode("utf-8") + assert flags == "-I " + dir diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index b10bf1b46016..f8cc95a7a3ca 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -1,10 +1,10 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_raises import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_utils import map_dtype_to_device from dpnp.exceptions import ExecutionPlacementError diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py index 27f34f6288b3..d54ae381a386 100644 --- a/dpnp/tests/test_indexing.py +++ b/dpnp/tests/test_indexing.py @@ -1,10 +1,8 @@ import functools import dpctl -import dpctl.tensor as dpt import numpy import pytest -from dpctl.tensor._type_utils import _to_device_supported_dtype from numpy.testing import ( assert_, assert_array_equal, @@ -14,8 +12,10 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.exceptions import AxisError, ExecutionPlacementError +from dpnp.tensor._type_utils import _to_device_supported_dtype from .helper import ( generate_random_numpy_array, diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py index 20d974b32f0c..74b9122e0d20 100644 --- a/dpnp/tests/test_linalg.py +++ b/dpnp/tests/test_linalg.py @@ -1,7 +1,6 @@ import warnings import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -13,6 +12,7 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import AxisError, ExecutionPlacementError from .helper import ( diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py index c35050afaa86..4fc4b8cb1619 100644 --- a/dpnp/tests/test_manipulation.py +++ b/dpnp/tests/test_manipulation.py @@ -1,6 +1,5 @@ import itertools -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -10,6 +9,7 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import AxisError from .helper import ( diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py index 511047372a14..8de7ec2ed80d 100644 --- a/dpnp/tests/test_mathematical.py +++ b/dpnp/tests/test_mathematical.py @@ -1,8 +1,6 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest -from dpctl.tensor._numpy_helper import normalize_axis_index from numpy.testing import ( assert_allclose, assert_array_equal, @@ -12,9 +10,11 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from 
dpnp.dpnp_utils import map_dtype_to_device from dpnp.exceptions import AxisError, ExecutionPlacementError +from dpnp.tensor._numpy_helper import normalize_axis_index from .helper import ( assert_dtype_allclose, diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py index 6a3d6ac5afae..5f5a9251fd75 100644 --- a/dpnp/tests/test_memory.py +++ b/dpnp/tests/test_memory.py @@ -1,9 +1,9 @@ -import dpctl.tensor as dpt import numpy import pytest import dpnp import dpnp.memory as dpm +import dpnp.tensor as dpt class IntUsmData(dpt.usm_ndarray): diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py index 48520015d354..598d1c2678ec 100644 --- a/dpnp/tests/test_nanfunctions.py +++ b/dpnp/tests/test_nanfunctions.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -12,6 +11,7 @@ ) import dpnp +import dpnp.tensor as dpt from dpnp.exceptions import ExecutionPlacementError from .helper import ( diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py index 6ce8645a11d4..5a848c9660fc 100644 --- a/dpnp/tests/test_ndarray.py +++ b/dpnp/tests/test_ndarray.py @@ -1,4 +1,3 @@ -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -10,6 +9,7 @@ ) import dpnp +import dpnp.tensor as dpt from .helper import ( generate_random_numpy_array, @@ -567,6 +567,9 @@ def test_print_dpnp_special_character(character): assert result == expected +# TODO: repr formatting is inconsistent (scientific vs integer-like output) +# This is a minor issue that does not depend on compiler flags +@pytest.mark.skip(reason="SAT-8452") def test_print_dpnp_1d(): dtype = dpnp.default_float_type() result = repr(dpnp.arange(10000, dtype=dtype)) diff --git a/dpnp/tests/test_search.py b/dpnp/tests/test_search.py index 64c4eb75f906..75ce9bdeed20 100644 --- a/dpnp/tests/test_search.py +++ b/dpnp/tests/test_search.py @@ -1,9 +1,9 @@ -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_array_equal, assert_equal, assert_raises import dpnp +import dpnp.tensor as dpt from .helper import ( generate_random_numpy_array, diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py index cf436087b607..a02adfac2ecb 100644 --- a/dpnp/tests/test_statistics.py +++ b/dpnp/tests/test_statistics.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -10,6 +9,7 @@ ) import dpnp +import dpnp.tensor as dpt from .helper import ( assert_dtype_allclose, diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py index e4b9403df8a4..5420285d5940 100644 --- a/dpnp/tests/test_sycl_queue.py +++ b/dpnp/tests/test_sycl_queue.py @@ -2,13 +2,13 @@ import tempfile import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import assert_array_equal, assert_raises import dpnp import dpnp.linalg +import dpnp.tensor as dpt from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations from dpnp.exceptions import ExecutionPlacementError @@ -50,7 +50,7 @@ def assert_sycl_queue_equal(result, expected): assert result.sycl_device == expected.sycl_device assert result.is_in_order == expected.is_in_order assert result.has_enable_profiling == expected.has_enable_profiling - exec_queue = dpctl.utils.get_execution_queue([result, expected]) + exec_queue = dpt.get_execution_queue([result, expected]) assert exec_queue is not None diff --git a/dpnp/tests/test_usm_type.py 
b/dpnp/tests/test_usm_type.py index b73eb67d51ee..568cf2a2aff0 100644 --- a/dpnp/tests/test_usm_type.py +++ b/dpnp/tests/test_usm_type.py @@ -2,12 +2,11 @@ import tempfile from math import prod -import dpctl.tensor as dpt -import dpctl.utils as du import numpy import pytest import dpnp +import dpnp.tensor as dpt from dpnp.dpnp_utils import get_usm_allocations from .helper import generate_random_numpy_array @@ -29,7 +28,7 @@ def test_add(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -46,7 +45,7 @@ def test_multiply(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -63,7 +62,7 @@ def test_subtract(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -80,7 +79,7 @@ def test_divide(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -100,7 +99,7 @@ def test_remainder(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -121,7 +120,7 @@ def test_floor_divide(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -136,7 +135,7 @@ def test_power(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize( @@ -320,7 +319,7 @@ def test_linspace_arrays(usm_type_start, usm_type_stop): start = dpnp.array([0, 0], usm_type=usm_type_start) stop = dpnp.array([2, 4], usm_type=usm_type_stop) res = dpnp.linspace(start, stop, 4) - assert res.usm_type == du.get_coerced_usm_type( + assert res.usm_type == dpt.get_coerced_usm_type( [usm_type_start, usm_type_stop] ) @@ -376,7 +375,7 @@ def test_logic_op_2in(op, usm_type_x, usm_type_y): assert x.usm_type == zx.usm_type == usm_type_x assert y.usm_type == zy.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("op", ["bitwise_count", "bitwise_not"]) @@ -404,7 +403,7 @@ def test_bitwise_op_2in(op, usm_type_x, usm_type_y): assert x.usm_type == zx.usm_type == usm_type_x assert y.usm_type 
== zy.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) class TestMatmul: @@ -445,7 +444,7 @@ def test_basic(self, usm_type_x, usm_type_y, dtype, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type", list_of_usm_types) def test_syrk(self, usm_type): @@ -474,7 +473,7 @@ def test_matvec(usm_type_x, usm_type_y, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -496,7 +495,7 @@ def test_vecdot(usm_type_x, usm_type_y, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -518,7 +517,7 @@ def test_vecmat(usm_type_x, usm_type_y, shape1, shape2): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -744,7 +743,7 @@ def test_2in_1out(func, data1, data2, usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize( @@ -765,7 +764,7 @@ def test_2in_2out(func, data1, data2, usm_type_x, usm_type_y): assert ( z1.usm_type == z2.usm_type - == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) ) @@ -811,7 +810,7 @@ def test_piecewise(usm_type_x, usm_type_y, usm_type_z): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y assert z.usm_type == usm_type_z - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_z] ) @@ -836,7 +835,7 @@ def test_concat_stack(func, data1, data2, usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -848,7 +847,7 @@ def test_extract(usm_type_x, usm_type_y): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize( @@ -896,7 +895,9 @@ def test_obj_ndarray(self, usm_type, usm_type_other): assert x.usm_type == usm_type assert y.usm_type == usm_type_other - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other]) + assert z.usm_type == dpt.get_coerced_usm_type( + [usm_type, usm_type_other] + ) @pytest.mark.parametrize("usm_type", list_of_usm_types) @@ -941,7 +942,9 @@ def test_values_ndarray(self, obj, 
usm_type, usm_type_other): assert x.usm_type == usm_type assert y.usm_type == usm_type_other - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other]) + assert z.usm_type == dpt.get_coerced_usm_type( + [usm_type, usm_type_other] + ) @pytest.mark.parametrize("values", [-2, [-1, -2]], ids=["scalar", "list"]) @pytest.mark.parametrize("usm_type_other", list_of_usm_types) @@ -952,7 +955,9 @@ def test_obj_ndarray(self, values, usm_type, usm_type_other): assert x.usm_type == usm_type assert y.usm_type == usm_type_other - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_other]) + assert z.usm_type == dpt.get_coerced_usm_type( + [usm_type, usm_type_other] + ) @pytest.mark.parametrize("usm_type_y", list_of_usm_types) @pytest.mark.parametrize("usm_type_z", list_of_usm_types) @@ -965,7 +970,7 @@ def test_obj_values_ndarray(self, usm_type, usm_type_y, usm_type_z): assert x.usm_type == usm_type assert y.usm_type == usm_type_y assert z.usm_type == usm_type_z - assert res.usm_type == du.get_coerced_usm_type( + assert res.usm_type == dpt.get_coerced_usm_type( [usm_type, usm_type_y, usm_type_z] ) @@ -980,7 +985,7 @@ def test_take(func, usm_type_x, usm_type_ind): assert x.usm_type == usm_type_x assert ind.usm_type == usm_type_ind - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind]) @pytest.mark.parametrize( @@ -1004,7 +1009,7 @@ def test_take_along_axis(data, ind, axis, usm_type_x, usm_type_ind): assert x.usm_type == usm_type_x assert ind.usm_type == usm_type_ind - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind]) @pytest.mark.parametrize("usm_type", list_of_usm_types + [None]) @@ -1156,8 +1161,8 @@ def test_histogram(usm_type_v, usm_type_w): hist, edges = dpnp.histogram(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) - assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert edges.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -1172,13 +1177,13 @@ def test_histogram2d(usm_type_x, usm_type_y, usm_type_w): assert x.usm_type == usm_type_x assert y.usm_type == usm_type_y assert w.usm_type == usm_type_w - assert hist.usm_type == du.get_coerced_usm_type( + assert hist.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_w] ) - assert edges_x.usm_type == du.get_coerced_usm_type( + assert edges_x.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_w] ) - assert edges_y.usm_type == du.get_coerced_usm_type( + assert edges_y.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_y, usm_type_w] ) @@ -1192,7 +1197,7 @@ def test_bincount(usm_type_v, usm_type_w): hist = dpnp.bincount(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert hist.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize("usm_type_v", list_of_usm_types) @@ -1204,9 +1209,9 @@ def test_histogramdd(usm_type_v, usm_type_w): hist, edges = dpnp.histogramdd(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert hist.usm_type == 
du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert hist.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) for e in edges: - assert e.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert e.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize( @@ -1247,7 +1252,7 @@ def test_histogram_bin_edges(usm_type_v, usm_type_w): edges = dpnp.histogram_bin_edges(v, weights=w) assert v.usm_type == usm_type_v assert w.usm_type == usm_type_w - assert edges.usm_type == du.get_coerced_usm_type([usm_type_v, usm_type_w]) + assert edges.usm_type == dpt.get_coerced_usm_type([usm_type_v, usm_type_w]) @pytest.mark.parametrize("usm_type_x", list_of_usm_types) @@ -1256,7 +1261,7 @@ def test_select(usm_type_x, usm_type_y): condlist = [dpnp.array([True, False], usm_type=usm_type_x)] choicelist = [dpnp.array([1, 2], usm_type=usm_type_y)] res = dpnp.select(condlist, choicelist) - assert res.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + assert res.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_y]) @pytest.mark.parametrize("axis", [None, 0, -1]) @@ -1300,7 +1305,7 @@ def test_ediff1d(usm_type_x, usm_type_args, to_end, to_begin): res = dpnp.ediff1d(x, to_end=to_end, to_begin=to_begin) - assert res.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_args]) + assert res.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_args]) @pytest.mark.parametrize("usm_type", list_of_usm_types) @@ -1337,7 +1342,7 @@ def test_choose(usm_type_x, usm_type_ind): assert chc.usm_type == usm_type_x assert ind.usm_type == usm_type_ind - assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_ind]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type_x, usm_type_ind]) @pytest.mark.parametrize( @@ -1371,7 +1376,7 @@ def test_basic(self, usm_type_x, usm_type_xp, usm_type_fp): assert x.usm_type == usm_type_x assert xp.usm_type == usm_type_xp assert fp.usm_type == usm_type_fp - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [usm_type_x, usm_type_xp, usm_type_fp] ) @@ -1390,7 +1395,7 @@ def test_left_right(self, usm_type_x, usm_type_left, usm_type_right): assert left.usm_type == usm_type_left assert right.usm_type == usm_type_right - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [ x.usm_type, xp.usm_type, @@ -1523,7 +1528,7 @@ def test_lstsq(self, m, n, nrhs, usm_type, usm_type_other): assert a.usm_type == usm_type assert b.usm_type == usm_type_other for param in result: - assert param.usm_type == du.get_coerced_usm_type( + assert param.usm_type == dpt.get_coerced_usm_type( [usm_type, usm_type_other] ) @@ -1570,7 +1575,7 @@ def test_lu_solve(self, a_data, b_data, usm_type, usm_type_rhs): assert lu.usm_type == usm_type assert b.usm_type == usm_type_rhs - assert result.usm_type == du.get_coerced_usm_type( + assert result.usm_type == dpt.get_coerced_usm_type( [usm_type, usm_type_rhs] ) @@ -1730,7 +1735,7 @@ def test_solve(self, matrix, rhs, usm_type, usm_type_rhs): assert x.usm_type == usm_type assert y.usm_type == usm_type_rhs - assert z.usm_type == du.get_coerced_usm_type([usm_type, usm_type_rhs]) + assert z.usm_type == dpt.get_coerced_usm_type([usm_type, usm_type_rhs]) @pytest.mark.parametrize("full_matrices_param", [True, False]) @pytest.mark.parametrize("compute_uv_param", [True, False]) @@ -1796,6 +1801,6 @@ def test_tensorsolve(self, usm_type, usm_type_other): assert a.usm_type == usm_type assert 
b.usm_type == usm_type_other
-        assert result.usm_type == du.get_coerced_usm_type(
+        assert result.usm_type == dpt.get_coerced_usm_type(
             [usm_type, usm_type_other]
         )
diff --git a/dpnp/tests/test_utils.py b/dpnp/tests/test_utils.py
index eef9132e5b55..aef6abba8726 100644
--- a/dpnp/tests/test_utils.py
+++ b/dpnp/tests/test_utils.py
@@ -1,8 +1,8 @@
-import dpctl.tensor as dpt
 import numpy
 import pytest
 
 import dpnp
+import dpnp.tensor as dpt
 
 
 class TestIsSupportedArrayOrScalar:
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
index 41df0a82e0a0..eb9e958fad0b 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import dpctl
-import dpctl.tensor._dlpack as dlp
 import numpy
 import pytest
 
 import dpnp as cupy
+import dpnp.tensor._dlpack as dlp
 from dpnp.tests.third_party.cupy import testing
diff --git a/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py b/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
index 7399343e7e57..3b23b32fe3b2 100644
--- a/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
+++ b/dpnp/tests/third_party/cupy/indexing_tests/test_insert.py
@@ -84,7 +84,7 @@ def test_put(self, xp, dtype):
         # Take care so that actual indices don't overlap.
         if self.mode == "raise":
             pytest.skip("'raise' mode is not supported")
-        # `wrap` mode in dpctl.tensor.put is different from numpy.put (#1365):
+        # `wrap` mode in dpnp.tensor.put is different from numpy.put (#1365):
         # numpy's `wrap` mode wraps indices around for cyclic operations
         # while dpctl's `wrap` mode restricts indices to stay within the array bounds (-n <= i < n).
diff --git a/pyproject.toml b/pyproject.toml
index 78ebe9d9aa66..02567d2f25ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314']
 [tool.codespell]
 builtin = "clear,rare,informal,names"
 check-filenames = true
-ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart"
+ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT,inpT,wit"
 quiet-level = 3
 
 [tool.coverage.report]
@@ -134,13 +134,21 @@ source = [
 ensure_newline_before_comments = true
 force_grid_wrap = 0
 include_trailing_comma = true
+known_third_party = ["dpctl"]
 line_length = 80
 multi_line_output = 3
+profile = "black"
 skip = ["dpnp/__init__.py"]
 split_on_trailing_comma = true
 use_parentheses = true
 
 [tool.pylint.basic]
+disable = [
+    "wrong-import-order",
+    "ungrouped-imports",
+    "wrong-import-position"
+]
+ignored-modules = ["dpctl", "dpctl.*"]
 include-naming-hint = true
 
 [tool.pylint.classes]
diff --git a/setup.py b/setup.py
index cc21221299c4..3f5449663508 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,7 @@
     cmdclass=versioneer.get_cmdclass(),
     packages=[
         "dpnp",
+        "dpnp.tensor",
         "dpnp.dpnp_algo",
         "dpnp.dpnp_utils",
         "dpnp.exceptions",
@@ -52,12 +53,14 @@
             "dpnp_backend_c.lib",
             "dpnp_backend_c.dll",
             "tests/*.*",
+            "tests/tensor/*.py",
+            "tests/tensor/*/*.py",
             "tests/testing/*.py",
             "tests/third_party/cupy/*.py",
             "tests/third_party/cupy/*/*.py",
             "tests/third_party/cupyx/*.py",
             "tests/third_party/cupyx/*/*.py",
-        ]
+        ],
     },
     include_package_data=False,
 )